Mock Version: 2.12 Mock Version: 2.12 Mock Version: 2.12 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --target x86_64 --nodeps /builddir/build/SPECS/python-xformers.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-354286-71029/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=989gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bs --target x86_64 --nodeps /builddir/build/SPECS/python-xformers.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1708214400 Wrote: /builddir/build/SRPMS/python-xformers-0.0.23.post1-1.an23.src.rpm Child return code was: 0 ENTER ['do_with_status'](['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --target x86_64 --nodeps /builddir/build/SPECS/python-xformers.spec'], chrootPath='/var/lib/mock/dist-an23-epao-build-354286-71029/root'env={'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'}shell=Falselogger=timeout=86400uid=989gid=135user='mockbuild'nspawn_args=[]unshare_net=TrueprintOutput=False) Executing command: ['bash', '--login', '-c', '/usr/bin/rpmbuild -bb --target x86_64 --nodeps /builddir/build/SPECS/python-xformers.spec'] with env {'TERM': 'vt100', 'SHELL': '/bin/bash', 'HOME': '/builddir', 'HOSTNAME': 'mock', 'PATH': '/usr/bin:/bin:/usr/sbin:/sbin', 'PROMPT_COMMAND': 'printf "\\033]0;\\007"', 'PS1': ' \\s-\\v\\$ ', 'LANG': 'C.UTF-8'} and shell False Building target platforms: x86_64 Building for target x86_64 setting SOURCE_DATE_EPOCH=1708214400 Executing(%prep): /bin/sh -e /var/tmp/rpm-tmp.lcm99z + umask 022 + cd /builddir/build/BUILD + cd /builddir/build/BUILD + rm -rf xformers-0.0.23.post1 + /usr/lib/rpm/rpmuncompress -x /builddir/build/SOURCES/xformers-0.0.23.post1.tar.gz + STATUS=0 + '[' 0 -ne 0 ']' + cd xformers-0.0.23.post1 + /usr/bin/chmod -Rf a+rX,u+w,g-w,o-w . + rm -rf xformers.egg-info + sed -i /torch/d requirements.txt + RPM_EC=0 ++ jobs -p + exit 0 Executing(%build): /bin/sh -e /var/tmp/rpm-tmp.MsbzA5 + umask 022 + cd /builddir/build/BUILD + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CFLAGS + CXXFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + export CXXFLAGS + FFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FFLAGS + FCFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -I/usr/lib64/gfortran/modules' + export FCFLAGS + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + export LDFLAGS + LT_SYS_LIBRARY_PATH=/usr/lib64: + export LT_SYS_LIBRARY_PATH + CC=gcc + export CC + CXX=g++ + export CXX + cd xformers-0.0.23.post1 + export 'NVCC_FLAGS= --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all' + NVCC_FLAGS=' --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all' + export FORCE_CUDA=1 + FORCE_CUDA=1 + export MAX_JOBS=4 + MAX_JOBS=4 + export CUDA_HOME=/usr/local/cuda-12.1 + CUDA_HOME=/usr/local/cuda-12.1 + export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64/ + LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64/ + export 'TORCH_CUDA_ARCH_LIST=5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6;9.0' + TORCH_CUDA_ARCH_LIST='5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6;9.0' + CFLAGS='-O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection' + LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now -specs=/usr/lib/rpm/anolis/anolis-hardened-ld -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -Wl,--build-id=sha1 ' + /usr/bin/python3 setup.py build '--executable=/usr/bin/python3 -s' No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-12.1' fatal: not a git repository (or any of the parent directories): .git running build running build_py creating build creating build/lib.linux-x86_64-cpython-310 creating build/lib.linux-x86_64-cpython-310/xformers copying xformers/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers copying xformers/_cpp_lib.py -> build/lib.linux-x86_64-cpython-310/xformers copying xformers/_deprecation_warning.py -> build/lib.linux-x86_64-cpython-310/xformers copying xformers/attn_bias_utils.py -> build/lib.linux-x86_64-cpython-310/xformers copying xformers/checkpoint.py -> build/lib.linux-x86_64-cpython-310/xformers copying xformers/info.py -> build/lib.linux-x86_64-cpython-310/xformers copying xformers/test.py -> build/lib.linux-x86_64-cpython-310/xformers copying xformers/utils.py -> build/lib.linux-x86_64-cpython-310/xformers creating build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_attn_decoding.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_blocksparse_transformers.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_causal_blocksparse.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_core.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_indexing.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mem_eff_attention.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mem_eff_attn_decoder.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_mlp.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_multi_head_dispatch.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_nystrom_utils.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_revnet.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_sddmm.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_swiglu.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_transformer.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_blocksparse.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_dropout.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_fused_linear.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_layernorm.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/benchmark_triton_softmax.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks copying xformers/benchmarks/utils.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks creating build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/activations.py -> build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/input_projection.py -> build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/multi_head_dispatch.py -> build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/patch_embedding.py -> build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/residual.py -> build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/reversible.py -> build/lib.linux-x86_64-cpython-310/xformers/components copying xformers/components/simplicial_embedding.py -> build/lib.linux-x86_64-cpython-310/xformers/components creating build/lib.linux-x86_64-cpython-310/xformers/factory copying xformers/factory/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/factory copying xformers/factory/block_configs.py -> build/lib.linux-x86_64-cpython-310/xformers/factory copying xformers/factory/block_factory.py -> build/lib.linux-x86_64-cpython-310/xformers/factory copying xformers/factory/hydra_helper.py -> build/lib.linux-x86_64-cpython-310/xformers/factory copying xformers/factory/model_factory.py -> build/lib.linux-x86_64-cpython-310/xformers/factory copying xformers/factory/weight_init.py -> build/lib.linux-x86_64-cpython-310/xformers/factory creating build/lib.linux-x86_64-cpython-310/xformers/helpers copying xformers/helpers/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/helpers copying xformers/helpers/hierarchical_configs.py -> build/lib.linux-x86_64-cpython-310/xformers/helpers copying xformers/helpers/test_utils.py -> build/lib.linux-x86_64-cpython-310/xformers/helpers copying xformers/helpers/timm_sparse_attention.py -> build/lib.linux-x86_64-cpython-310/xformers/helpers creating build/lib.linux-x86_64-cpython-310/xformers/ops copying xformers/ops/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/ops copying xformers/ops/common.py -> build/lib.linux-x86_64-cpython-310/xformers/ops copying xformers/ops/indexing.py -> build/lib.linux-x86_64-cpython-310/xformers/ops copying xformers/ops/rmsnorm.py -> build/lib.linux-x86_64-cpython-310/xformers/ops copying xformers/ops/rope_padded.py -> build/lib.linux-x86_64-cpython-310/xformers/ops copying xformers/ops/swiglu_op.py -> build/lib.linux-x86_64-cpython-310/xformers/ops copying xformers/ops/unbind.py -> build/lib.linux-x86_64-cpython-310/xformers/ops creating build/lib.linux-x86_64-cpython-310/xformers/profiler copying xformers/profiler/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/profiler copying xformers/profiler/api.py -> build/lib.linux-x86_64-cpython-310/xformers/profiler copying xformers/profiler/device_limits.py -> build/lib.linux-x86_64-cpython-310/xformers/profiler copying xformers/profiler/profiler.py -> build/lib.linux-x86_64-cpython-310/xformers/profiler copying xformers/profiler/profiler_dcgm.py -> build/lib.linux-x86_64-cpython-310/xformers/profiler copying xformers/profiler/profiler_dcgm_impl.py -> build/lib.linux-x86_64-cpython-310/xformers/profiler copying xformers/profiler/slow_ops_profiler.py -> build/lib.linux-x86_64-cpython-310/xformers/profiler creating build/lib.linux-x86_64-cpython-310/xformers/sparse copying xformers/sparse/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/sparse copying xformers/sparse/_csr_ops.py -> build/lib.linux-x86_64-cpython-310/xformers/sparse copying xformers/sparse/blocksparse_tensor.py -> build/lib.linux-x86_64-cpython-310/xformers/sparse copying xformers/sparse/csr_tensor.py -> build/lib.linux-x86_64-cpython-310/xformers/sparse copying xformers/sparse/utils.py -> build/lib.linux-x86_64-cpython-310/xformers/sparse creating build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/dropout.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/fused_linear_layer.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/k_activations.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/k_dropout.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/k_fused_matmul_bw.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/k_fused_matmul_fw.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/k_layer_norm.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/k_softmax.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/layer_norm.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/softmax.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/utils.py -> build/lib.linux-x86_64-cpython-310/xformers/triton copying xformers/triton/vararg_kernel.py -> build/lib.linux-x86_64-cpython-310/xformers/triton creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/bert_padding.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_interface.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_triton.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_attn_triton_og.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_blocksparse_attention.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/flash_blocksparse_attn_interface.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn copying xformers/_flash_attn/fused_softmax.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn creating build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/batch_fetch_results.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/batch_submit.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_grid_search.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_tasks.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA copying xformers/benchmarks/LRA/run_with_submitit.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA creating build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/dataset.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA/code copying xformers/benchmarks/LRA/code/model_wrapper.py -> build/lib.linux-x86_64-cpython-310/xformers/benchmarks/LRA/code creating build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/_sputnik_sparse.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/attention_mask.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/attention_patterns.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/base.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/blocksparse.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/compositional.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/core.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/favor.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/fourier_mix.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/global_tokens.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/lambda_layer.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/linformer.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/local.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/nystrom.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/ortho.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/pooling.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/random.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/scaled_dot_product.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/sparsity_config.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/utils.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention copying xformers/components/attention/visual.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention creating build/lib.linux-x86_64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/base.py -> build/lib.linux-x86_64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/conv_mlp.py -> build/lib.linux-x86_64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/fused_mlp.py -> build/lib.linux-x86_64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/mixture_of_experts.py -> build/lib.linux-x86_64-cpython-310/xformers/components/feedforward copying xformers/components/feedforward/mlp.py -> build/lib.linux-x86_64-cpython-310/xformers/components/feedforward creating build/lib.linux-x86_64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/base.py -> build/lib.linux-x86_64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/param.py -> build/lib.linux-x86_64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/rotary.py -> build/lib.linux-x86_64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/sine.py -> build/lib.linux-x86_64-cpython-310/xformers/components/positional_embedding copying xformers/components/positional_embedding/vocab.py -> build/lib.linux-x86_64-cpython-310/xformers/components/positional_embedding creating build/lib.linux-x86_64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/base.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention/feature_maps copying xformers/components/attention/feature_maps/softmax.py -> build/lib.linux-x86_64-cpython-310/xformers/components/attention/feature_maps creating build/lib.linux-x86_64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/k_index_select_cat.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/k_scaled_index_add.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/rmsnorm_kernels.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/_triton copying xformers/ops/_triton/rope_padded_kernels.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/_triton creating build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/attn_bias.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/common.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/cutlass.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/decoder.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/dispatch.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/flash.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/small_k.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/triton.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha copying xformers/ops/fmha/triton_splitk.py -> build/lib.linux-x86_64-cpython-310/xformers/ops/fmha creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/patch_embed.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/layers copying xformers/_flash_attn/layers/rotary.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/layers creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/losses copying xformers/_flash_attn/losses/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/losses copying xformers/_flash_attn/losses/cross_entropy.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/losses creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/baichuan.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/bert.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/bigcode.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/falcon.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gpt.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gpt_neox.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/gptj.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/llama.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/opt.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models copying xformers/_flash_attn/models/vit.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/models creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/block.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/embedding.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/mha.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/modules copying xformers/_flash_attn/modules/mlp.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/modules creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/activations.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/fused_dense.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/layer_norm.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops copying xformers/_flash_attn/ops/rms_norm.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/benchmark.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/distributed.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/generation.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/utils copying xformers/_flash_attn/utils/pretrained.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/utils creating build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/__init__.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/cross_entropy.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/k_activations.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/layernorm.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/linear.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/mlp.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton copying xformers/_flash_attn/ops/triton/rotary.py -> build/lib.linux-x86_64-cpython-310/xformers/_flash_attn/ops/triton running build_ext /usr/lib64/python3.10/site-packages/torch/utils/cpp_extension.py:398: UserWarning: There are no g++ version bounds defined for CUDA version 12.1 warnings.warn(f'There are no {compiler_name} version bounds defined for CUDA version {cuda_str_version}') building 'xformers._C_flashattention' extension creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310 creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1 creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn creating /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src Emitting ninja build file /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/build.ninja... Compiling objects... Using envvar MAX_JOBS (4) as the number of workers... [1/49] g++ -MMD -MF /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/flash_api.o.d -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -D_GNU_SOURCE -fPIC -fwrapv -O2 -flto=auto -ffat-lto-objects -fexceptions -g -grecord-gcc-switches -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -specs=/usr/lib/rpm/anolis/anolis-hardened-cc1 -fstack-protector-strong -specs=/usr/lib/rpm/anolis/anolis-annobin-cc1 -m64 -march=x86-64-v2 -mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection -fcf-protection -fPIC -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/flash_api.cpp -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/flash_api.o -O3 -std=c++17 -fopenmp -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/flash_api.cpp: In function ‘void set_params_fprop(Flash_fwd_params&, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t, at::Tensor, at::Tensor, at::Tensor, at::Tensor, void*, void*, void*, void*, void*, float, float, int, int)’: /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:48:11: warning: ‘void* memset(void*, int, size_t)’ clearing an object of non-trivial type ‘struct Flash_fwd_params’; use assignment or value-initialization instead [-Wclass-memaccess] 48 | memset(¶ms, 0, sizeof(params)); | ~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/flash_api.cpp:13: /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash.h:51:8: note: ‘struct Flash_fwd_params’ declared here 51 | struct Flash_fwd_params : public Qkv_params { | ^~~~~~~~~~~~~~~~ [2/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 50 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers [3/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 51 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi160ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers [4/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 44 registers, 800 bytes cmem[0] [5/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 47 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi128ELi64ELi128ELi8ELi2ELi4ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 46 registers, 800 bytes cmem[0] [6/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 192 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 224 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 216 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 62 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 61 registers, 800 bytes cmem[0] [7/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 800 bytes cmem[0] [8/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 32 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi224ELi64ELi64ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 800 bytes cmem[0] [9/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 192 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 220 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 224 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 216 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 64 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 212 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 184 bytes stack frame, 216 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 208 bytes stack frame, 248 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 256 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 208 bytes stack frame, 236 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 216 bytes stack frame, 252 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 200 bytes stack frame, 232 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb1ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi192ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 63 registers, 800 bytes cmem[0] [10/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] [11/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 79 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] [12/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi32ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] [13/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 88 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb1EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 88 bytes stack frame, 96 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 108 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 72 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi256ELi64ELi64ELi8ELi4ELi2ELi2ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 80 registers, 800 bytes cmem[0] [14/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 52 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers [15/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 56 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 8 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 56 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 52 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 32 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 35 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 35 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 68 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 40 bytes stack frame, 48 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 88 bytes stack frame, 100 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 104 bytes stack frame, 132 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 24 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 84 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi96ELi64ELi128ELi8ELi2ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 800 bytes cmem[0] [16/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 45 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 104 bytes stack frame, 120 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 112 bytes stack frame, 136 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 160 bytes stack frame, 192 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 168 bytes stack frame, 200 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] [17/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_bwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 104 bytes stack frame, 120 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 112 bytes stack frame, 136 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 160 bytes stack frame, 192 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 168 bytes stack frame, 200 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 20 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_90' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_86' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 800 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 24 bytes stack frame, 24 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 48 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 16 bytes stack frame, 32 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb0ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 40 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi64ELi128ELi8ELi2ELi4ELi4ELb1ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi64ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z27flash_bwd_convert_dq_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 60 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb0EEv16Flash_bwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb0ELb1ELb1EEv16Flash_bwd_params 40 bytes stack frame, 36 bytes spill stores, 36 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb0ELb1ELb0ELb1EEv16Flash_bwd_params 24 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb0EEv16Flash_bwd_params 64 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb1ELb1EEv16Flash_bwd_params 48 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z44flash_bwd_dq_dk_dv_loop_seqk_parallel_kernelI23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EELb1ELb1ELb0ELb0ELb1EEv16Flash_bwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 800 bytes cmem[0] ptxas info : Compiling entry function '_Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params' for 'sm_80' ptxas info : Function properties for _Z25flash_bwd_dot_do_o_kernelILb1E23Flash_bwd_kernel_traitsILi64ELi128ELi128ELi8ELi4ELi4ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi8ES2_EEEv16Flash_bwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 800 bytes cmem[0] [18/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 144 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 228 bytes spill stores, 260 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 440 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 108 bytes spill stores, 272 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 80 bytes stack frame, 100 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 252 bytes spill stores, 300 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 64 bytes spill stores, 140 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 172 bytes spill stores, 256 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 248 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 32 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 76 bytes spill stores, 128 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 80 bytes stack frame, 100 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 252 bytes spill stores, 300 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 172 bytes spill stores, 236 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 392 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 52 bytes spill stores, 96 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 192 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 176 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 88 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 296 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 240 bytes stack frame, 392 bytes spill stores, 480 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 152 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 36 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 432 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 92 bytes spill stores, 220 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 224 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 172 bytes spill stores, 188 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 424 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 84 bytes spill stores, 140 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 396 bytes spill stores, 416 bytes spill loads ptxas info : Used 255 registers [19/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 172 bytes spill stores, 256 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 248 bytes stack frame, 388 bytes spill stores, 468 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 54 bytes spill stores, 54 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 32 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 504 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 288 bytes spill stores, 492 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 132 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 192 bytes stack frame, 388 bytes spill stores, 470 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 172 bytes spill stores, 236 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 152 bytes stack frame, 174 bytes spill stores, 234 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 418 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 64 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 144 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 264 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 228 bytes spill stores, 260 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 240 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 62 bytes spill stores, 62 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 392 bytes spill stores, 680 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 132 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 192 bytes stack frame, 388 bytes spill stores, 470 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 258 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 418 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 104 bytes stack frame, 176 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 24 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 88 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 296 bytes spill stores, 252 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 224 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 240 bytes stack frame, 392 bytes spill stores, 480 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 152 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 62 bytes spill stores, 62 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 36 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 298 bytes spill stores, 630 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 136 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 208 bytes stack frame, 410 bytes spill stores, 568 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 172 bytes spill stores, 188 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 152 bytes stack frame, 172 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi128ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 240 bytes stack frame, 438 bytes spill stores, 574 bytes spill loads ptxas info : Used 255 registers [20/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 116 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 184 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 195 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 156 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 128 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 212 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 592 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 428 bytes spill stores, 920 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 104 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 240 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 592 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 608 bytes spill stores, 852 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 204 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 388 bytes spill stores, 908 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 192 bytes spill stores, 180 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 552 bytes spill stores, 828 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 164 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 616 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 432 bytes spill stores, 912 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 108 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 176 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 616 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 484 bytes spill stores, 788 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 552 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 324 bytes spill stores, 720 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 560 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 120 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 160 bytes spill stores, 140 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 176 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 552 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 364 bytes spill stores, 800 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 108 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 184 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 616 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 484 bytes spill stores, 788 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 212 bytes spill stores, 240 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 296 bytes spill stores, 680 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 560 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 408 bytes spill stores, 644 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] [21/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 136 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 116 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 164 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 728 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 720 bytes spill stores, 1372 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 176 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 768 bytes spill stores, 1172 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 16 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 208 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 688 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 698 bytes spill stores, 1186 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 466 bytes spill stores, 450 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 116 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 184 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 195 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 156 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 128 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 212 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 664 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 774 bytes spill stores, 1342 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 76 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 240 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 664 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 846 bytes spill stores, 1098 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 52 bytes spill stores, 62 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 44 bytes spill stores, 106 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 204 bytes spill stores, 232 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 640 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 678 bytes spill stores, 1222 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 192 bytes spill stores, 180 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 248 bytes stack frame, 462 bytes spill stores, 614 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 40 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 60 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 120 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 52 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 104 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 160 bytes spill stores, 140 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 72 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 64 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 120 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 176 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 728 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 728 bytes spill stores, 1288 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 56 bytes stack frame, 84 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 178 bytes spill stores, 162 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 184 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 768 bytes spill stores, 1172 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 16 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 128 bytes stack frame, 212 bytes spill stores, 240 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 648 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 658 bytes spill stores, 1138 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 144 bytes stack frame, 308 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi128ELi32ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi128ELi32ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 466 bytes spill stores, 450 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [22/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [23/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 32 bytes stack frame, 36 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 56 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 94 bytes spill stores, 114 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 48 bytes spill stores, 70 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 40 bytes stack frame, 40 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 68 bytes spill stores, 94 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 106 bytes spill stores, 138 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 96 bytes stack frame, 152 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 52 bytes spill stores, 86 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 200 bytes stack frame, 78 bytes spill stores, 164 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 94 bytes spill stores, 114 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 156 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 44 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 132 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [24/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 92 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 84 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 100 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 84 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 104 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 28 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 84 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 44 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 112 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 116 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 40 bytes stack frame, 44 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 100 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 152 bytes spill stores, 172 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 40 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 44 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 76 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 40 bytes stack frame, 44 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 32 bytes stack frame, 40 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [25/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 62 bytes spill stores, 54 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 272 bytes stack frame, 136 bytes spill stores, 238 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 192 bytes stack frame, 256 bytes spill stores, 292 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 148 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 220 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 82 bytes spill stores, 86 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 264 bytes spill stores, 264 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 104 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 276 bytes spill stores, 276 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 104 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 96 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 264 bytes stack frame, 292 bytes spill stores, 280 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 108 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 112 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 128 bytes stack frame, 200 bytes spill stores, 266 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 184 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 56 bytes spill stores, 60 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 84 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 208 bytes stack frame, 246 bytes spill stores, 316 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 272 bytes spill stores, 272 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 80 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 80 bytes stack frame, 88 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 58 bytes spill stores, 50 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 224 bytes stack frame, 108 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 44 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 88 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 120 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 184 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 40 bytes stack frame, 50 bytes spill stores, 42 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 200 bytes spill stores, 200 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 96 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 92 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi224ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 272 bytes spill stores, 272 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [26/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 216 bytes stack frame, 228 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 48 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 144 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 88 bytes stack frame, 148 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 72 bytes spill stores, 168 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 176 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 56 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 72 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 156 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 376 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 36 bytes spill stores, 80 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 192 bytes stack frame, 326 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 132 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 236 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 352 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 148 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 224 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 336 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 132 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 236 bytes spill stores, 192 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 376 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 36 bytes spill stores, 60 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 148 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 280 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 168 bytes stack frame, 220 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 224 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 344 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 8 bytes spill stores, 16 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 48 bytes stack frame, 104 bytes spill stores, 104 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [27/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 140 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 120 bytes stack frame, 240 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 114 bytes spill stores, 158 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 248 bytes stack frame, 224 bytes spill stores, 280 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 190 bytes spill stores, 278 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 168 bytes stack frame, 128 bytes spill stores, 152 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 128 bytes stack frame, 304 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 112 bytes stack frame, 204 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 264 bytes stack frame, 260 bytes spill stores, 310 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 488 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 166 bytes spill stores, 250 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 144 bytes stack frame, 284 bytes spill stores, 220 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 244 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 272 bytes stack frame, 96 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 280 bytes stack frame, 252 bytes spill stores, 296 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 80 bytes stack frame, 66 bytes spill stores, 74 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 456 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 140 bytes spill stores, 176 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 96 bytes stack frame, 160 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 284 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 264 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 296 bytes stack frame, 236 bytes spill stores, 312 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 424 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 108 bytes spill stores, 156 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 280 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 140 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 96 bytes stack frame, 236 bytes spill stores, 204 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 272 bytes stack frame, 96 bytes spill stores, 160 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 280 bytes stack frame, 252 bytes spill stores, 296 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 88 bytes stack frame, 74 bytes spill stores, 98 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 20 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 504 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 212 bytes spill stores, 284 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 96 bytes stack frame, 160 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 284 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 252 bytes spill stores, 244 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 68 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 296 bytes stack frame, 236 bytes spill stores, 312 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 440 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 144 bytes spill stores, 192 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi256ELi128ELi64ELi8ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi128ELi64ELi8ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 112 bytes stack frame, 280 bytes spill stores, 208 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [28/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 152 bytes stack frame, 168 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 576 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 288 bytes spill stores, 352 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 304 bytes stack frame, 346 bytes spill stores, 328 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 736 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 488 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 632 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 332 bytes spill stores, 380 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 632 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 388 bytes spill stores, 392 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 92 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 60 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 152 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 36 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 512 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 224 bytes spill stores, 268 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 80 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 280 bytes stack frame, 316 bytes spill stores, 300 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 848 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 632 bytes spill stores, 672 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 592 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 340 bytes spill stores, 384 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 116 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 612 bytes spill stores, 652 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 112 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 116 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 496 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 188 bytes spill stores, 244 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 304 bytes stack frame, 346 bytes spill stores, 328 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 736 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 488 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 64 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 228 bytes spill stores, 276 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 176 bytes spill stores, 168 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 640 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 392 bytes spill stores, 396 bytes spill loads [29/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 696 bytes stack frame, 476 bytes spill stores, 544 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 72 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1384 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1048 bytes spill stores, 1330 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 728 bytes stack frame, 664 bytes spill stores, 636 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1336 bytes spill stores, 1614 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 72 bytes spill stores, 72 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1408 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1044 bytes spill stores, 1272 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 176 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 176 bytes stack frame, 228 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 688 bytes stack frame, 492 bytes spill stores, 574 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 72 bytes stack frame, 116 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1376 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 916 bytes spill stores, 1144 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 112 bytes stack frame, 164 bytes spill stores, 148 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 728 bytes stack frame, 664 bytes spill stores, 636 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 264 bytes stack frame, 436 bytes spill stores, 408 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1528 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1340 bytes spill stores, 1618 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 64 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1368 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1008 bytes spill stores, 1182 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 184 bytes spill stores, 176 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 136 bytes stack frame, 236 bytes spill stores, 224 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 776 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 92 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 60 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 152 bytes stack frame, 216 bytes spill stores, 216 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 20 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 608 bytes stack frame, 384 bytes spill stores, 448 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 36 bytes spill stores, 48 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 1416 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1016 bytes spill stores, 1262 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 40 bytes stack frame, 80 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 720 bytes stack frame, 580 bytes spill stores, 552 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1424 bytes spill stores, 1728 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 32 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1400 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 1128 bytes spill stores, 1322 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 80 bytes stack frame, 120 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 160 bytes stack frame, 288 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi32ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi128ELi128ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 784 bytes stack frame, 1024 bytes spill stores, 984 bytes spill loads ptxas info : Used 255 registers [30/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 232 bytes stack frame, 360 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 96 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 40 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 128 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 184 bytes stack frame, 296 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 332 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 208 bytes stack frame, 340 bytes spill stores, 356 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 100 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 32 bytes stack frame, 32 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 128 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 184 bytes stack frame, 296 bytes spill stores, 288 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 44 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 332 bytes spill stores, 324 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 224 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 248 bytes stack frame, 412 bytes spill stores, 372 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 152 bytes stack frame, 192 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 264 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 272 bytes stack frame, 448 bytes spill stores, 476 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 44 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 48 bytes stack frame, 60 bytes spill stores, 64 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 132 bytes spill stores, 128 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 176 bytes stack frame, 328 bytes spill stores, 348 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 104 bytes stack frame, 208 bytes spill stores, 180 bytes spill loads ptxas info : Used 255 registers [31/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim64_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 208 bytes stack frame, 340 bytes spill stores, 356 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 64 bytes stack frame, 100 bytes spill stores, 120 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 176 bytes spill stores, 222 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 176 bytes stack frame, 196 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 240 bytes stack frame, 348 bytes spill stores, 344 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 384 bytes stack frame, 424 bytes spill stores, 504 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 216 bytes stack frame, 238 bytes spill stores, 314 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 264 bytes stack frame, 556 bytes spill stores, 496 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 224 bytes stack frame, 368 bytes spill stores, 384 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 248 bytes stack frame, 412 bytes spill stores, 372 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 152 bytes stack frame, 192 bytes spill stores, 212 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 264 bytes spill stores, 248 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 272 bytes stack frame, 448 bytes spill stores, 476 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 208 bytes stack frame, 166 bytes spill stores, 174 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 192 bytes stack frame, 200 bytes spill stores, 284 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 200 bytes stack frame, 246 bytes spill stores, 226 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 424 bytes stack frame, 536 bytes spill stores, 610 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 176 bytes stack frame, 172 bytes spill stores, 256 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 176 bytes stack frame, 370 bytes spill stores, 318 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 232 bytes stack frame, 360 bytes spill stores, 376 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 152 bytes spill stores, 136 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 48 bytes stack frame, 92 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 200 bytes stack frame, 340 bytes spill stores, 304 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 56 bytes stack frame, 96 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 168 bytes stack frame, 244 bytes spill stores, 228 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 240 bytes stack frame, 364 bytes spill stores, 364 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 176 bytes spill stores, 222 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 184 bytes stack frame, 228 bytes spill stores, 286 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 240 bytes stack frame, 348 bytes spill stores, 344 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 384 bytes stack frame, 424 bytes spill stores, 504 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 208 bytes spill stores, 276 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi64ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi64ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 264 bytes stack frame, 556 bytes spill stores, 496 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] [32/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 145 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 130 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 56 bytes stack frame, 52 bytes spill stores, 52 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 140 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 544 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 256 bytes spill stores, 464 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 120 bytes stack frame, 148 bytes spill stores, 144 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 120 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 344 bytes spill stores, 404 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 480 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 200 bytes spill stores, 368 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 160 bytes stack frame, 284 bytes spill stores, 268 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 68 bytes spill stores, 76 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 84 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 400 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 72 bytes spill stores, 120 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 196 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 316 bytes spill stores, 476 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 360 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 24 bytes spill stores, 36 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 296 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 138 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 155 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 504 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 164 bytes spill stores, 276 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 196 bytes spill stores, 184 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 568 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb1ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 316 bytes spill stores, 476 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 496 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 192 bytes spill stores, 280 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 152 bytes stack frame, 296 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 178 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 154 registers, 688 bytes cmem[0] [33/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 145 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 130 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 166 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 264 bytes stack frame, 214 bytes spill stores, 214 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 48 bytes stack frame, 140 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 776 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 510 bytes spill stores, 726 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 408 bytes stack frame, 368 bytes spill stores, 360 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 64 bytes stack frame, 120 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 408 bytes stack frame, 530 bytes spill stores, 608 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 680 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 416 bytes spill stores, 560 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 16 bytes spill stores, 16 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 376 bytes stack frame, 460 bytes spill stores, 554 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 150 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 158 registers ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 155 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 24 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 296 bytes stack frame, 222 bytes spill stores, 266 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 132 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 752 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 588 bytes spill stores, 800 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 344 bytes stack frame, 248 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 488 bytes stack frame, 542 bytes spill stores, 644 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 24 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 792 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 612 bytes spill stores, 752 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 432 bytes stack frame, 464 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 136 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 272 bytes stack frame, 194 bytes spill stores, 254 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb0EEv16Flash_fwd_params 40 bytes stack frame, 84 bytes spill stores, 100 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb0ELb1EEv16Flash_fwd_params 768 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb0ELb1ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 524 bytes spill stores, 692 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 344 bytes stack frame, 248 bytes spill stores, 320 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb0EEv16Flash_fwd_params 32 bytes stack frame, 80 bytes spill stores, 80 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb0ELb1ELb0ELb1ELb1EEv16Flash_fwd_params 488 bytes stack frame, 542 bytes spill stores, 644 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 696 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22compute_attn_1rowblockI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES3_EELb1ELb1ELb0ELb0ELb0ELb1E16Flash_fwd_paramsEEvRKT6_iii 0 bytes stack frame, 416 bytes spill stores, 516 bytes spill loads ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 24 bytes stack frame, 40 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi128ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi128ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 432 bytes stack frame, 464 bytes spill stores, 588 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 152 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 162 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 160 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z16flash_fwd_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi64ELi4ES2_EELb1ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] [34/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 16 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1584 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [35/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim128_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1536 bytes stack frame, 16 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 8 bytes stack frame, 8 bytes spill stores, 8 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1520 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEESB_NSA_ILi2EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESB_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm64ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi16EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm64ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_S8_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJNS7_ILi8EEESB_EEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJNS7_ILi64EEENS7_ILi128EEEEEENS7_ILi256EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSQ_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS1A_NS5_INS6_IJS1B_SQ_SN_EEENS6_IJS1D_NS7_ILi1024EEENS6_IJS1E_NS7_ILi8192EEEEEEEEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES21_S21_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SQ_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS10_SA_SB_EEENS6_IJNS6_IJSZ_SQ_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SB_EEEEENS24_IS27_NS5_INS6_IJS28_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSB_S10_EEENS6_IJSQ_SE_EEEEEEEEEEENS6_IJSB_SB_EEEEENS1_7ThrCopyIS2K_iEENS2W_IS2V_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi128ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi128ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] [36/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 179 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 175 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 183 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 184 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi5EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers [37/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim160_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 169 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 175 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 191 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 183 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 56 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 54 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 197 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 175 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 177 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 201 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 53 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 52 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 170 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 174 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 192 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 194 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 186 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 183 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 190 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 199 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi160ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi160ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers [38/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 211 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 64 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_INSA_ILi16EEELi0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_INSA_ILi16EEELi0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS14_IT3_T4_EERKNS14_IT5_T6_EES1I_RKNS14_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [39/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim192_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1152 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 225 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4608 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2304 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1152 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 576 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 288 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELi8ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi192ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi192ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] [40/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 96 bytes spill stores, 96 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 232 bytes stack frame, 92 bytes spill stores, 92 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 208 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJNSA_ILi4EEESC_EEESE_SF_EEENS9_IJSI_iNSA_ILi16EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS14_ISJ_Li0EEENS14_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1B_IT3_T4_EERKNS1B_IT5_T6_EES1P_RKNS1B_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISG_NS9_IJSI_NSA_ILi1024EEENSA_ILi2048EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENS13_ISJ_Li0EEENS13_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [41/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim224_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 72 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 70 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 112 bytes stack frame, 112 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 128 bytes stack frame, 124 bytes spill stores, 124 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 67 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 233 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 88 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 144 bytes stack frame, 68 bytes spill stores, 68 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 104 bytes stack frame, 108 bytes spill stores, 108 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 77 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 76 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 69 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 68 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 217 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 216 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 120 bytes stack frame, 116 bytes spill stores, 116 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 160 bytes stack frame, 88 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 88 bytes stack frame, 104 bytes spill stores, 112 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 234 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi2EEENSA_ILi7EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 232 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi224ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi224ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] [42/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1664 bytes stack frame, 64 bytes spill stores, 88 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1664 bytes stack frame, 60 bytes spill stores, 84 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1648 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1632 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash23copy_rotary_interleavedILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSE_SC_EEESE_SE_EEENS9_IJSH_iNSA_ILi32EEEEEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass10bfloat16_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_30SM80_16x8x16_F32BF16BF16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 136 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [43/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] [44/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim256_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 48 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 640 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 320 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 160 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 80 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 48 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 247 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 20 bytes spill stores, 24 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 48 bytes spill stores, 44 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 56 bytes spill stores, 56 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 2560 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 1280 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 34 registers, 640 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 320 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 160 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 80 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELi4ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 33 registers, 48 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 231 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1600 bytes stack frame, 28 bytes spill stores, 32 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 1616 bytes stack frame, 44 bytes spill stores, 40 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb1EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEENS2_INS1_8smem_ptrIS5_EEEENS8_ISF_NS9_IJSH_NSA_ILi1024EEENSA_ILi4096EEEEEEEES7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENS12_INSA_ILi16EEELi0EEENS12_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS1A_IT3_T4_EERKNS1A_IT5_T6_EES1O_RKNS1A_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Function properties for _ZN5flash4gemmILb0ELb0EN4cute6TensorINS1_13array_alignedIfLm32ELm16EEENS1_6LayoutINS1_5tupleIJNS6_IJNS1_1CILi2EEES8_EEENS7_ILi1EEENS7_ILi8EEEEEENS6_IJNS6_IJSA_S8_EEENS7_ILi0EEENS7_ILi4EEEEEEEEEENS2_INS3_IN7cutlass6half_tELm128ELm16EEENS5_INS6_IJNS6_IJS8_S8_S8_EEESA_NS6_IJS9_SF_EEEEEENS6_IJNS6_IJSA_S8_SF_EEESE_NS6_IJNS6_IJSB_NS7_ILi16EEEEEENS7_ILi32EEEEEEEEEEEEENS2_INS3_ISK_Lm512ELm16EEENS5_INS6_IJS9_SB_SN_EEENS6_IJSD_SF_NS6_IJNS6_IJSS_NS7_ILi64EEEEEENS7_ILi128EEEEEEEEEEEEENS2_INS1_10ViewEngineINS1_8smem_ptrISK_EEEENS5_INS6_IJNS6_IJSB_SA_EEESA_SN_EEENS6_IJNS6_IJSA_SE_EEESE_NS6_IJNS6_IJiiEEENS7_ILi4096EEEEEEEEEEEEENS2_IS19_NS5_INS6_IJS1A_SF_SN_EEENS6_IJS1C_NS7_ILi1024EEES1F_EEEEEEENS1_8TiledMMAINS1_8MMA_AtomIJNS1_28SM80_16x8x16_F32F16F16F32_TNEEEENS5_INS6_IJSF_SA_SA_EEENS6_IJSA_SE_SE_EEEEENS5_INS6_IJSA_S8_SA_EEENS6_IJSE_SA_SE_EEEEENS6_IJNS1_10UnderscoreES1Y_S1Y_EEEEENS1_9TiledCopyINS1_9Copy_AtomIJNS1_17SM75_U32x4_LDSM_NESK_EEENS5_INS6_IJNS6_IJSF_SB_SF_EEENS6_IJSM_NS6_IJSA_SA_EEEEEEEEENS6_IJNS6_IJS11_SA_SQ_EEENS6_IJNS6_IJSZ_SB_NS7_ILi512EEEEEENS6_IJSE_SE_EEEEEEEEEEENS6_IJSZ_SQ_EEEEENS21_IS24_NS5_INS6_IJS25_NS6_IJS9_NS6_IJS8_SA_EEEEEEEEENS6_IJNS6_IJSS_SA_SE_EEENS6_IJNS6_IJSQ_S11_EEENS6_IJSB_SE_EEEEEEEEEEENS6_IJSQ_SQ_EEEEENS1_7ThrCopyIS2H_iEENS2T_IS2S_iEEEEvRT1_RT2_RT3_RKT4_RKT5_T6_T7_T8_T9_T10_ 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi256ELi64ELi64ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi256ELi64ELi64ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb1ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass6half_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEESE_EEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi64EEEEEEEES7_SK_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SE_EEENS9_IJNS9_IJSG_SC_EEEiSI_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISF_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESG_EEENSV_INSA_ILi16EEELi0EEENSV_ISI_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads [45/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim32_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 26 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 241 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 24 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 29 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 28 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 246 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 251 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 229 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 31 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 21 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 32 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 30 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 25 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 239 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 12 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 253 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 8 bytes stack frame, 4 bytes spill stores, 4 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 16 bytes stack frame, 12 bytes spill stores, 16 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 242 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 243 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 245 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 254 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 249 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi32ELi64ELi256ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi32ELi64ELi256ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 255 registers [46/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_bf16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 196 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 200 registers ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 206 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 219 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 207 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 185 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 221 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 72 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Function properties for _ZN5flash22copy_rotary_contiguousILb0ELb0EN4cute10ViewEngineINS1_8gmem_ptrIN7cutlass10bfloat16_tEEEEENS1_6LayoutINS1_5tupleIJNS9_IJNS1_1CILi8EEENSA_ILi1EEEEEENSA_ILi4EEENSA_ILi3EEEEEENS9_IJNS9_IJSC_NSA_ILi0EEEEEEjNSA_ILi32EEEEEEEES7_SL_S7_NS8_INS9_IJNS9_IJSC_SB_EEESE_SF_EEENS9_IJNS9_IJSH_SC_EEEiSJ_EEEEENS2_INS1_23ArithmeticTupleIteratorINS1_15ArithmeticTupleIJiiEEEEEEENS8_ISG_NS9_IJNS9_IJNS1_11ScaledBasisISC_Li1EEESH_EEENSW_ISJ_Li0EEENSW_ISJ_Li1EEEEEEEEEEvRKNS1_6TensorIT1_T2_EERNS13_IT3_T4_EERKNS13_IT5_T6_EES1H_RKNS13_IT7_T8_EEiiii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass10bfloat16_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] [47/49] /usr/local/cuda-12.1/bin/nvcc -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src -I/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/cutlass/include -I/usr/lib64/python3.10/site-packages/torch/include -I/usr/lib64/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/lib64/python3.10/site-packages/torch/include/TH -I/usr/lib64/python3.10/site-packages/torch/include/THC -I/usr/local/cuda-12.1/include -I/usr/include/python3.10 -c -c /builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.cu -o /builddir/build/BUILD/xformers-0.0.23.post1/build/temp.linux-x86_64-cpython-310/builddir/build/BUILD/xformers-0.0.23.post1/third_party/flash-attention/csrc/flash_attn/src/flash_fwd_split_hdim96_fp16_sm80.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -DHAS_PYTORCH --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --extended-lambda -D_ENABLE_EXTENDED_ALIGNED_STORAGE -std=c++17 --generate-line-info -DNDEBUG --compiler-options -fPIC -Wno-deprecated-gpu-targets -allow-unsupported-compiler --fatbin-options -compress-all --threads 4 --ptxas-options=-v -O3 -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --ptxas-options=-v -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 --generate-line-info -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1017"' -DTORCH_EXTENSION_NAME=_C_flashattention -D_GLIBCXX_USE_CXX11_ABI=1 -ccbin gcc /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=false, =0]" at line 77 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h Remark: The warnings can be suppressed with "-diag-suppress " /usr/lib64/python3.10/site-packages/torch/include/c10/util/irange.h(54): warning #186-D: pointless comparison of unsigned integer with zero return other.value < 0 || value == other.value; ^ detected during: instantiation of "__nv_bool c10::detail::integer_iterator>::operator==(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 61 instantiation of "__nv_bool c10::detail::integer_iterator>::operator!=(const c10::detail::integer_iterator> &) const [with I=size_t, one_sided=true, =0]" at line 2327 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h instantiation of "__nv_bool c10::TensorImpl::SetDimsTemplate(c10::ArrayRef) [with T=int64_t, =void]" at line 2337 of /usr/lib64/python3.10/site-packages/torch/include/c10/core/TensorImpl.h ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 41 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 36 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 193 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 209 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 205 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 248 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 236 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_80' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 226 registers, 688 bytes cmem[0] ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 37 registers, 144 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 4352 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 2176 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 1088 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 208 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 171 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 172 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 180 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 202 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 167 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 223 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 215 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 198 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 244 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_90' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 252 registers ptxas info : 2 bytes gmem ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 39 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 38 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi7ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 8704 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi6ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 4352 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi5ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 2176 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi4ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 1088 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi3ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 544 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi2ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 272 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z32flash_fwd_splitkv_combine_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELi16ELi1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 40 registers, 144 bytes smem, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 204 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 168 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 173 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 176 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 189 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 187 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 213 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 181 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 210 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 224 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 218 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 214 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 220 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb0ELb1ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 227 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 250 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 235 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 212 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb0ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 237 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb1ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 222 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 238 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb0ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 230 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb0EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 240 registers, 688 bytes cmem[0] ptxas info : Compiling entry function '_Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params' for 'sm_86' ptxas info : Function properties for _Z24flash_fwd_splitkv_kernelI23Flash_fwd_kernel_traitsILi96ELi64ELi128ELi4ELb0ELb0EN7cutlass6half_tE19Flash_kernel_traitsILi96ELi64ELi128ELi4ES2_EELb1ELb0ELb0ELb1ELb1ELb1EEv16Flash_fwd_params 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 228 registers, 688 bytes cmem[0]