Mock Version: 3.5 Mock Version: 3.5 Mock Version: 3.5 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-354199-71029/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1708214400 Wrote: /builddir/build/SRPMS/python-xformers-0.0.24-1.an23.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-354199-71029/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=982gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: aarch64 Building for target aarch64 setting SOURCE_DATE_EPOCH=1708214400 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.l1DleP + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf xformers-0.0.24 + /usr/lib/rpm/rpmuncompress -x /builddir/build/SOURCES/xformers-0.0.24.tar.gz + STATUS=0 + '[' 0 -ne 0 ']' + cd xformers-0.0.24 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + rm -rf xformers.egg-info + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.kbVpAm + umask 022 + cd /builddir/build/BUILD + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CFLAGS + CXXFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + export CXXFLAGS + FFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd xformers-0.0.24 + export 'NVCC_FLAGS= --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all' + NVCC_FLAGS=' --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all' + export FORCE_CUDA=1 + FORCE_CUDA=1 + export MAX_JOBS=80 + MAX_JOBS=80 + export CUDA_HOME=/usr/local/cuda-12.1 + CUDA_HOME=/usr/local/cuda-12.1 + export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64/ + LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64/ + export 'TORCH_CUDA_ARCH_LIST=5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6;9.0' + TORCH_CUDA_ARCH_LIST='5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6;9.0' + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection' + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + /usr/bin/python3 setup.py build '--executable=/usr/bin/python3 -s' No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-12.1' fatal: not a git repository (or any of the parent directories): .git Looks like we are using CUDA 12.1 which segfaults when provided with the -generate-line-info flag. Disabling it. Looks like we are using CUDA 12.1 which segfaults when provided with the -generate-line-info flag. Disabling it. running build running build_py creating build creating build/lib.linux-aarch64-cpython-310 creating build/lib.linux-aarch64-cpython-310/xformers copying xformers/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/_cpp_lib.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/_deprecation_warning.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/attn_bias_utils.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/checkpoint.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/info.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/test.py -> build/lib.linux-aarch64-cpython-310/xformers copying xformers/utils.py -> build/lib.linux-aarch64-cpython-310/xformers creating build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_attn_decoding.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_blocksparse_transformers.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_causal_blocksparse.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_core.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_indexing.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mem_eff_attention.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mem_eff_attn_decoder.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_multi_head_dispatch.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_nystrom_utils.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_revnet.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_sddmm.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_sequence_parallel_fused.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_sp24.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_swiglu.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_tiled_matmul.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_transformer.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_blocksparse.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_dropout.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_fused_linear.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_layernorm.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks copying xformers/benchmarks/utils.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks creating build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/activations.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/input_projection.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/multi_head_dispatch.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/patch_embedding.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/residual.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/reversible.py -> build/lib.linux-aarch64-cpython-310/xformers/components copying xformers/components/simplicial_embedding.py -> build/lib.linux-aarch64-cpython-310/xformers/components creating build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/block_configs.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/block_factory.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/hydra_helper.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/model_factory.py -> build/lib.linux-aarch64-cpython-310/xformers/factory copying xformers/factory/weight_init.py -> build/lib.linux-aarch64-cpython-310/xformers/factory creating build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/hierarchical_configs.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/test_utils.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers copying xformers/helpers/timm_sparse_attention.py -> build/lib.linux-aarch64-cpython-310/xformers/helpers creating build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/common.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/differentiable_collectives.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/indexing.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/modpar_layers.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/rmsnorm.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/rope_padded.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/seqpar.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/sequence_parallel_fused_ops.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/sp24.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/swiglu_op.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/tiled_matmul.py -> build/lib.linux-aarch64-cpython-310/xformers/ops copying xformers/ops/unbind.py -> build/lib.linux-aarch64-cpython-310/xformers/ops creating build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/api.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/device_limits.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/profiler.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/profiler_dcgm.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/profiler_dcgm_impl.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler copying xformers/profiler/slow_ops_profiler.py -> build/lib.linux-aarch64-cpython-310/xformers/profiler creating build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/_csr_ops.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/blocksparse_tensor.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/csr_tensor.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse copying xformers/sparse/utils.py -> build/lib.linux-aarch64-cpython-310/xformers/sparse creating build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/dropout.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/fused_linear_layer.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_activations.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_dropout.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_fused_matmul_bw.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_fused_matmul_fw.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_layer_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/k_softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/layer_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/triton copying xformers/triton/vararg_kernel.py -> build/lib.linux-aarch64-cpython-310/xformers/triton creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/bert_padding.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_interface.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_triton.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_triton_og.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_blocksparse_attention.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_blocksparse_attn_interface.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/fused_softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn creating build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/batch_fetch_results.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/batch_submit.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_grid_search.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_tasks.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_with_submitit.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA creating build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/dataset.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/model_wrapper.py -> build/lib.linux-aarch64-cpython-310/xformers/benchmarks/LRA/code creating build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/_sputnik_sparse.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/attention_mask.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/attention_patterns.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/blocksparse.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/compositional.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/core.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/favor.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/fourier_mix.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/global_tokens.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/lambda_layer.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/linformer.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/local.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/nystrom.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/ortho.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/pooling.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/random.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/scaled_dot_product.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/sparsity_config.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/utils.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention copying xformers/components/attention/visual.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention creating build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/conv_mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/fused_mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/mixture_of_experts.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/components/feedforward creating build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/param.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/rotary.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/sine.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/vocab.py -> build/lib.linux-aarch64-cpython-310/xformers/components/positional_embedding creating build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/base.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/softmax.py -> build/lib.linux-aarch64-cpython-310/xformers/components/attention/feature_maps creating build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/k_index_select_cat.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/k_scaled_index_add.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/rmsnorm_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/rope_padded_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/sequence_parallel_fused_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/tiled_matmul_kernels.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/_triton creating build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/attn_bias.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/common.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/cutlass.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/decoder.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/dispatch.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/flash.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/small_k.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/triton.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/triton_splitk.py -> build/lib.linux-aarch64-cpython-310/xformers/ops/fmha creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/patch_embed.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/rotary.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/layers creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses copying xformers/_flash_attn/losses/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses copying xformers/_flash_attn/losses/cross_entropy.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/losses creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/baichuan.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/bert.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/bigcode.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/falcon.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gpt.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gpt_neox.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gptj.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/llama.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/opt.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/vit.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/models creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/block.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/embedding.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/mha.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/modules creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/activations.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/fused_dense.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/layer_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/rms_norm.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/benchmark.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/distributed.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/generation.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/pretrained.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/utils creating build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/__init__.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/cross_entropy.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/k_activations.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/layernorm.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/linear.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/mlp.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/rotary.py -> build/lib.linux-aarch64-cpython-310/xformers/_flash_attn/ops/triton running build_ext /usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py:398: UserWarning: There are no g++ version bounds defined for CUDA version 12.1 warnings.warn(f'There are no {compiler_name} version bounds defined for CUDA version {cuda_str_version}') building 'xformers._C_flashattention' extension creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310 creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24 creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn creating /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src Emitting ninja build file /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/build.ninja... Compiling objects... Using envvar MAX_JOBS (80) as the number of workers... [1/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [2/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [3/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [4/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [5/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [6/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [7/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [8/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [9/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 800 bytes cmem[0] [10/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [11/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [12/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [13/49] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -mbranch-protection=standard -fasynchronous-unwind-tables -fstack-clash-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘void set_params_fprop(Flash_fwd_params&, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t, at::Tensor, at::Tensor, at::Tensor, at::Tensor, void*, void*, void*, void*, void*, float, float, int, int)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:48:11: warning: ‘void* memset(void*, int, size_t)’ clearing an object of non-trivial type ‘struct Flash_fwd_params’; use assignment or value-initialization instead [-Wclass-memaccess] 48 | memset(¶ms, 0, sizeof(params)); | ~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:13: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash.h:51:8: note: ‘struct Flash_fwd_params’ declared here 51 | struct Flash_fwd_params : public Qkv_params { | ^~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_fwd(at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, float, float, bool, int, int, bool, c10::optional)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:347:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 347 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_varlen_fwd(const at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, const at::Tensor&, const at::Tensor&, c10::optional&, int, int, float, float, bool, bool, int, int, bool, c10::optional)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:541:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 541 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_bwd(const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, c10::optional&, c10::optional&, float, float, bool, int, int, c10::optional, c10::optional&)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:751:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 751 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_varlen_bwd(const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, c10::optional&, c10::optional&, const at::Tensor&, const at::Tensor&, int, int, float, float, bool, bool, int, int, c10::optional, c10::optional&)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:973:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 973 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘std::vector mha_fwd_kvcache(at::Tensor&, const at::Tensor&, const at::Tensor&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, c10::optional&, float, bool, int, int, bool, int)’: /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:1167:38: warning: narrowing conversion of ‘(char)(& q)->at::Tensor::.at::TensorBase::get_device()’ from ‘char’ to ‘c10::DeviceIndex’ {aka ‘signed char’} [-Wnarrowing] 1167 | at::cuda::CUDAGuard device_guard{(char)q.get_device()}; | ^~~~~~~~~~~~~~~~~~~~ [14/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [15/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [16/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [17/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [18/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [19/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [20/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [21/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [22/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [23/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [24/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [25/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [26/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers [27/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [28/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [29/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [30/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [31/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [32/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [33/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [34/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 92 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 60 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 152 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 608 bytes stack frame, 384 bytes spill stores, 448 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 36 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1416 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1016 bytes spill stores, 1262 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 80 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 720 bytes stack frame, 580 bytes spill stores, 552 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1424 bytes spill stores, 1728 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1400 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1128 bytes spill stores, 1322 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 120 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 784 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 696 bytes stack frame, 476 bytes spill stores, 544 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1384 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1048 bytes spill stores, 1330 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 728 bytes stack frame, 664 bytes spill stores, 636 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1336 bytes spill stores, 1614 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1044 bytes spill stores, 1272 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 176 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 688 bytes stack frame, 492 bytes spill stores, 574 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 116 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1376 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 916 bytes spill stores, 1144 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 728 bytes stack frame, 664 bytes spill stores, 636 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1340 bytes spill stores, 1618 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 64 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1368 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1008 bytes spill stores, 1182 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 176 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [35/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 152 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 576 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 288 bytes spill stores, 352 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 304 bytes stack frame, 346 bytes spill stores, 328 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 736 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 488 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 632 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 332 bytes spill stores, 380 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 632 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 388 bytes spill stores, 392 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 92 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 60 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 152 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 36 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 512 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 224 bytes spill stores, 268 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 80 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 280 bytes stack frame, 316 bytes spill stores, 300 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 848 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 632 bytes spill stores, 672 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 592 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 116 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 612 bytes spill stores, 652 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 116 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 496 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 188 bytes spill stores, 244 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 304 bytes stack frame, 346 bytes spill stores, 328 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 736 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 488 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 64 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 228 bytes spill stores, 276 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 640 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 392 bytes spill stores, 396 bytes spill loads [36/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers [37/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [38/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [39/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [40/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [41/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [42/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [43/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [44/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [45/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [46/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [47/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [48/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc FAILED: /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.o /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h [49/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.24/build/temp.linux-aarch64-cpython-310/builddir/build/BUILD/xformers-0.0.24/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 116 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 184 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 195 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 156 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 128 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 212 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 664 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 774 bytes spill stores, 1342 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 240 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 664 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 846 bytes spill stores, 1098 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 52 bytes spill stores, 62 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 44 bytes spill stores, 106 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 204 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 640 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 678 bytes spill stores, 1222 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 192 bytes spill stores, 180 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 248 bytes stack frame, 462 bytes spill stores, 614 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 40 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 116 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 164 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 728 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 720 bytes spill stores, 1372 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 176 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 768 bytes spill stores, 1172 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 16 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 688 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 698 bytes spill stores, 1186 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 466 bytes spill stores, 450 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 120 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 160 bytes spill stores, 140 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 120 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 176 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 728 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 728 bytes spill stores, 1288 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 184 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 768 bytes spill stores, 1172 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 16 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 212 bytes spill stores, 240 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 648 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 658 bytes spill stores, 1138 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 466 bytes spill stores, 450 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ninja: build stopped: subcommand failed. Traceback (most recent call last): File "/usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py", line 1893, in _run_ninja_build subprocess.run( File "/usr/lib64/python3.10/subprocess.py", line 526, in run raise CalledProcessError(retcode, process.args, subprocess.CalledProcessError: Command '['ninja', '-v', '-j', '80']' returned non-zero exit status 1. The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/builddir/build/BUILD/xformers-0.0.24/setup.py", line 405, in setuptools.setup( File "/usr/lib/python3.10/site-packages/setuptools/__init__.py", line 87, in setup return distutils.core.setup(**attrs) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/core.py", line 185, in setup return run_commands(dist) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/core.py", line 201, in run_commands dist.run_commands() File "/usr/lib/python3.10/site-packages/setuptools/_distutils/dist.py", line 968, in run_commands self.run_command(cmd) File "/usr/lib/python3.10/site-packages/setuptools/dist.py", line 1217, in run_command super().run_command(command) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/dist.py", line 987, in run_command cmd_obj.run() File "/usr/lib/python3.10/site-packages/setuptools/_distutils/command/build.py", line 132, in run self.run_command(cmd_name) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/cmd.py", line 319, in run_command self.distribution.run_command(command) File "/usr/lib/python3.10/site-packages/setuptools/dist.py", line 1217, in run_command super().run_command(command) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/dist.py", line 987, in run_command cmd_obj.run() File "/usr/lib/python3.10/site-packages/setuptools/command/build_ext.py", line 84, in run _build_ext.run(self) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/command/build_ext.py", line 346, in run self.build_extensions() File "/builddir/build/BUILD/xformers-0.0.24/setup.py", line 361, in build_extensions super().build_extensions() File "/usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py", line 843, in build_extensions build_ext.build_extensions(self) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/command/build_ext.py", line 466, in build_extensions self._build_extensions_serial() File "/usr/lib/python3.10/site-packages/setuptools/_distutils/command/build_ext.py", line 492, in _build_extensions_serial self.build_extension(ext) File "/usr/lib/python3.10/site-packages/setuptools/command/build_ext.py", line 246, in build_extension _build_ext.build_extension(self, ext) File "/usr/lib/python3.10/site-packages/setuptools/_distutils/command/build_ext.py", line 547, in build_extension objects = self.compiler.compile( File "/usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py", line 658, in unix_wrap_ninja_compile _write_ninja_file_and_compile_objects( File "/usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py", line 1574, in _write_ninja_file_and_compile_objects _run_ninja_build( File "/usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py", line 1909, in _run_ninja_build raise RuntimeError(message) from e RuntimeError: Error compiling objects for extension error: Bad exit status from /var/tmp/rpm-tmp.kbVpAm (%build) RPM build errors: Bad exit status from /var/tmp/rpm-tmp.kbVpAm (%build) Child return code was: 1 EXCEPTION: [Error('Command failed: \n # bash --login -c /usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec\n', 1)] Traceback (most recent call last): File "/usr/lib/python3.6/site-packages/mockbuild/trace_decorator.py", line 93, in trace result = func(*args, **kw) File "/usr/lib/python3.6/site-packages/mockbuild/util.py", line 598, in do_with_status raise exception.Error("Command failed: \n # %s\n%s" % (command, output), child.returncode) mockbuild.exception.Error: Command failed: # bash --login -c /usr/bin/rpmbuild -bb --noclean --target aarch64 --nodeps /builddir/build/SPECS/python-xformers.spec