Following the steps for https://pytorch.org/docs/stable/torch.compiler_inductor_profiling.html#torchinductor-gpu-profiling =========================================== On A10g g3.xlarge AWS instance ========================================= $> TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 TORCHINDUCTOR_BENCHMARK_KERNEL=1 python -u benchmarks/dynamo/timm_models.py --backend inductor --amp --performance --dashboard --only mixnet_l --disable-cudagraphs --training /opt/conda/envs/nllm3/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. _torch_pytree._register_pytree_node( loading model: 0it [00:03, ?it/s] cuda train mixnet_l Compiled module path: /tmp/torchinductor_ubuntu/5s/c5stj2jxenm5drqkam5psa2ziynzsqlkiib7jetadkyizgwntzck.py Compiled module path: /tmp/torchinductor_ubuntu/4n/c4nztrt3dq5vcvrhdh3lpv5zhuu4u5wpsi5lapcghivvvpyl3hg2.py running benchmark: 100%|███████████████████████████████| 30/30 [00:15<00:00, 1.88it/s] 1.120x =================== Profile for the generated forward graph =========================== $> cp /tmp/torchinductor_ubuntu/5s/c5stj2jxenm5drqkam5psa2ziynzsqlkiib7jetadkyizgwntzck.py fwd.py $> python fwd.py -p /opt/conda/envs/nllm3/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. _torch_pytree._register_pytree_node( 0.071641 STAGE:2024-01-27 23:11:15 31897:31897 ActivityProfilerController.cpp:314] Completed Stage: Warm Up 0.072743 STAGE:2024-01-27 23:11:23 31897:31897 ActivityProfilerController.cpp:320] Completed Stage: Collection STAGE:2024-01-27 23:11:23 31897:31897 ActivityProfilerController.cpp:324] Completed Stage: Post Processing Profiling result for a compiled module of benchmark mixnet_l: Chrome trace for the profile is written to /tmp/compiled_module_profile.json ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Input Shapes ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- void cudnn::cnn::conv2d_grouped_direct_kernel(float, cudnnTensorStru 1.95538 15.0 27.29% ampere_fp16_s1688gemm_fp16_128x128_ldg8_f2f_stages_32x1_tn 1.62528 3.0 22.69% void conv2d_c1_k1_nhwc_kernel_specialized<__half, __half, __half, float, float, 7, 2, 8, 2, true>(float, cudnnTensorStru 0.84468 4.0 11.79% void conv2d_c1_k1_nhwc_kernel_specialized<__half, __half, __half, float, float, 5, 2, 8, 2, true>(float, cudnnTensorStru 0.84219 4.0 11.76% void conv2d_c1_k1_nhwc_kernel_specialized<__half, __half, __half, float, float, 3, 2, 8, 2, true>(float, cudnnTensorStru 0.79363 4.0 11.08% void conv2d_c1_k1_nhwc_kernel_specialized<__half, __half, __half, float, float, 5, 1, 8, 3, true>(float, cudnnTensorStru 0.79288 12.0 11.07% sm86_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x128x32_stage3_warpsize2x2x1_g1_tensor16x8x16_e 0.71032 6.0 9.91% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s16 0.66651 8.0 9.30% sm80_xmma_fprop_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x32x16_stage1_warpsize4x1x1_g1_ 0.60876 6.0 8.50% sm80_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x32x64_stage5_warpsize2x2x1_g1_tensor16x8x16_exe 0.55655 9.0 7.77% void conv2d_c1_k1_nhwc_kernel_specialized<__half, __half, __half, float, float, 7, 1, 8, 3, true>(float, cudnnTensorStru 0.53954 9.0 7.53% ampere_fp16_s1688gemm_fp16_256x64_ldg8_f2f_tn 0.4769 1.0 6.66% void cutlass_cudnn::Kernel(cutlass_tensorop_f16_ 0.44553 7.0 6.22% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s16 0.4233 1.0 5.91% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gemm_f1 0.35166 6.0 4.91% sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize96x128x32_stage4_warpsize2x2x1_tensor16x8x16_kernel 0.32669 6.0 4.56% sm80_xmma_fprop_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x32x64_stage1_warpsize4x1x1_g1_ 0.32389 6.0 4.52% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s16 0.31025 10.0 4.33% sm80_xmma_fprop_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize128x32x16_stage1_warpsize4x1x1_g1_ 0.30084 1.0 4.20% sm86_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage3_warpsize2x2x1_g1_tensor16x8x16_exe 0.26155 3.0 3.65% sm86_xmma_fprop_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x128x32_stage4_warpsize2x2x1_g1_tensor16x8x16_ex 0.23531 6.0 3.28% void nhwcAddPaddingKernel<__half, __half, float, true, (cudnnKernelDataType_t)0>(int, int, int, int, int, int, int, int, 0.22214 12.0 3.10% void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStr 0.18228 6.0 2.54% Memset (Device) 0.05489 19.0 0.77% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s1 0.051 3.0 0.71% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s1 0.04299 8.0 0.60% void cudnn::ops::nchwToNhwcKernel<__half, __half, float, false, true, (cudnnKernelDataType_t)0>(cudnn::ops::nchw2nhwc_pa 0.024 6.0 0.34% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gem 0.02013 1.0 0.28% sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize32x32x64_stage6_warpsize2x2x1_tensor16x8x16_kernel 0.01801 4.0 0.25% _ZN2at6native54_GLOBAL__N__d8ceb000_21_DistributionNormal_cu_0c5b6e8543distribution_elementwise_grid_stride_kernelIfLi4E 0.0112 4.22 0.16% void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __ha 0.0081 4.0 0.11% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s16 0.00436 1.0 0.06% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s1 0.004 1.0 0.06% void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at:: 0.00116 0.58 0.02% Total 28.2738 394.66% Percent of time when GPU is busy: 998.12% Total wall time 7.164 ms Output for tabulate: mixnet_l, 482.63%, 115.20%, 5.63%, 0.00%, 394.66%, 998.12%, 7.164ms ======================================================================================== =================== Profile for the generated backward graph =========================== ======================================================================================== $> cp /tmp/torchinductor_ubuntu/4n/c4nztrt3dq5vcvrhdh3lpv5zhuu4u5wpsi5lapcghivvvpyl3hg2.py bwd.py $> python bwd.py -p /opt/conda/envs/nllm3/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. _torch_pytree._register_pytree_node( 0.174035 STAGE:2024-01-28 00:05:01 33157:33157 ActivityProfilerController.cpp:314] Completed Stage: Warm Up 0.176385 STAGE:2024-01-28 00:05:19 33157:33157 ActivityProfilerController.cpp:320] Completed Stage: Collection STAGE:2024-01-28 00:05:20 33157:33157 ActivityProfilerController.cpp:324] Completed Stage: Post Processing Profiling result for a compiled module of benchmark mixnet_l: Chrome trace for the profile is written to /tmp/compiled_module_profile.json ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Input Shapes ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- sm80_xmma_wgrad_implicit_gemm_indexed_f16f16_f16f32_... 0.00% 0.000us 0.00% 0.000us 0.000us 1.997s 11.46% 1.997s 1.816ms 1100 [] void dgrad2d_c1_k1_nhwc_kernel_specialized_window<__... 0.00% 0.000us 0.00% 0.000us 0.000us 1.965s 11.28% 1.965s 4.912ms 400 [] void at::native::elementwise_kernel<128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.941s 11.14% 1.941s 76.416us 25400 [] aten::convolution_backward 0.04% 5.854ms 0.07% 11.150ms 111.500us 1.485s 8.52% 1.599s 15.988ms 100 [[128, 64, 56, 56], [128, 64, 112, 112], [64, 1, 7, 7], [], [], [], [], [], [], aten::convolution_backward 0.06% 10.319ms 0.12% 20.414ms 204.140us 1.159s 6.65% 1.259s 12.587ms 100 [[128, 60, 28, 28], [128, 60, 56, 56], [60, 1, 9, 9], [], [], [], [], [], [], [] aten::convolution_backward 0.19% 31.019ms 0.44% 73.044ms 243.480us 976.703ms 5.61% 1.270s 4.232ms 300 [[128, 156, 14, 14], [128, 156, 14, 14], [156, 1, 9, 9], [], [], [], [], [], [], void dgrad2d_c1_k1_nhwc_kernel<__half, float, float,... 0.00% 0.000us 0.00% 0.000us 0.000us 969.776ms 5.57% 969.776ms 881.615us 1100 [] cudaLaunchKernel 2.27% 376.777ms 2.27% 376.777ms 5.499us 938.073ms 5.38% 946.283ms 13.812us 68513 [] aten::convolution_backward 0.19% 31.163ms 0.43% 71.934ms 239.780us 776.417ms 4.46% 1.051s 3.502ms 300 [[128, 120, 14, 14], [128, 120, 14, 14], [120, 1, 9, 9], [], [], [], [], [], [], void dgrad2d_c1_k1_nhwc_kernel_specialized_window<__... 0.00% 0.000us 0.00% 0.000us 0.000us 713.334ms 4.09% 713.334ms 1.783ms 400 [] ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- Self CPU time total: 16.590s Self CUDA time total: 17.423s == triton_pointwise category kernels == Kernel Self CUDA TIME (ms) Count Percent ------------------------------------------------------------------------------------------------------------------------ --------------------- ------- --------- triton_poi_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_231_0d1d2d3d4d5d6 5.69146 1.0 32.70% triton_poi_fused__native_batch_norm_legit_functional_cat_mul_native_batch_norm_backward_166_0d1d2d3d4d5d6d7d8d9d10de 2.06931 3.0 11.89% triton_poi_fused_convolution_backward_232_0d1d2d3d4de 1.92557 1.0 11.06% triton_poi_fused_convolution_backward_234_0d1d2d3d4de 1.9197 1.0 11.03% triton_poi_fused_add_cat_div_fill_mul_native_batch_norm_backward_sigmoid_sub_160_0d1d2d3d4d5d6de 1.6575 3.0 9.52% triton_poi_fused_cat_mul_native_batch_norm_backward_195_0d1d2d3d4d5d6de 1.60189 1.0 9.20% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_162_0d1d2d3d4d5d6d7d8de 1.58835 3.0 9.13% triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_199_0d1d2d3d4d5d6d7 1.51959 1.0 8.73% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_115_0d1d2d3d4d5d6d7d8de 1.46566 6.0 8.42% triton_poi_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_221_0d1d2d3d4d5d6 1.42452 1.0 8.19% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_75_0d1d2d3d4d5d6d7d8de 1.13242 6.0 6.51% triton_poi_fused__native_batch_norm_legit_functional_add_cat_convolution_backward_native_batch_norm_backward_threshold_b 0.98655 1.0 5.67% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_22_0d1d2d3d4d5d6d7d8de 0.92679 6.0 5.33% triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_threshold_backward_ 0.80569 1.0 4.63% triton_poi_fused_cat_mul_native_batch_norm_backward_120_0d1d2d3d4d5d6de 0.77278 3.0 4.44% triton_poi_fused_add_cat_div_fill_mul_native_batch_norm_backward_sigmoid_sub_113_0d1d2d3d4d5d6de 0.76716 3.0 4.41% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_211_0d1d2d3d4d5d6d7d8 0.75476 1.0 4.34% triton_poi_fused__native_batch_norm_legit_functional_cat_convolution_backward_native_batch_norm_backward_threshold_backw 0.67957 1.0 3.90% triton_poi_fused_convolution_backward_167_0d1d2de 0.64143 3.0 3.69% triton_poi_fused__native_batch_norm_legit_functional_cat_convolution_backward_native_batch_norm_backward_238_0d1d2d3d4d5 0.62247 1.0 3.58% triton_poi_fused_convolution_backward_169_0d1d2de 0.61866 3.0 3.55% triton_poi_fused_add_cat_div_fill_mul_native_batch_norm_backward_sigmoid_sub_71_0d1d2d3d4d5d6de 0.58972 3.0 3.39% triton_poi_fused_cat_mul_native_batch_norm_backward_80_0d1d2d3d4d5d6de 0.58952 3.0 3.39% triton_poi_fused__native_batch_norm_legit_functional_cat_convolution_backward_mul_native_batch_norm_backward_148_0d1d2d3 0.53351 1.0 3.07% triton_poi_fused_add_cat_div_fill_mul_native_batch_norm_backward_sigmoid_sub_18_0d1d2d3d4d5d6de 0.4981 3.0 2.86% triton_poi_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_186_0d1d2d3 0.49117 1.0 2.82% triton_poi_fused_cat_mul_native_batch_norm_backward_27_0d1d2d3d4d5d6de 0.48766 3.0 2.80% triton_poi_fused_cat_mul_native_batch_norm_backward_56_0d1d2d3d4d5d6de 0.39636 1.0 2.28% triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_native_batch_norm_backward_60_0d1d2d3d4d5d6d7d 0.37881 1.0 2.18% triton_poi_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_214_0d1d2d3d4d5d6d7d8d9d10de 0.325 1.0 1.87% triton_poi_fused_convolution_backward_222_0d1d2de 0.32196 1.0 1.85% triton_poi_fused_convolution_backward_224_0d1d2de 0.31474 1.0 1.81% triton_poi_fused_convolution_backward_226_0d1d2de 0.31472 1.0 1.81% triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_div_fill_mul_native_batch_norm_backward_si 0.24629 1.0 1.42% triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_mul_native_batch_norm_backward_101_0d1d2d3d4d5 0.2415 1.0 1.39% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_204_0d1d2d3d4d5d6d7d8de 0.19124 1.0 1.10% triton_poi_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_138_0d1d2d3 0.17089 1.0 0.98% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_153_0d1d2d3d4d5d6d7d8de 0.13117 2.0 0.75% triton_poi_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_47_0d1d2d3d 0.12145 1.0 0.70% triton_poi_fused_convolution_backward_215_0d1d2de 0.11752 1.0 0.68% triton_poi_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_171_0d1d2d3d4d5d6d7d8d9d10de 0.11215 1.0 0.64% triton_poi_fused_convolution_backward_217_0d1d2de 0.10973 1.0 0.63% triton_poi_fused__native_batch_norm_legit_functional_convolution_backward_div_native_batch_norm_backward_threshold_backw 0.10011 1.0 0.58% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_65_0d1d2d3d4d5d6d7d8de 0.0975 2.0 0.56% triton_poi_fused_convolution_backward_189_0d1d2d3d4de 0.08742 1.0 0.50% triton_poi_fused_convolution_backward_187_0d1d2d3d4de 0.08526 1.0 0.49% triton_poi_fused_add_cat_174_0d1d2d3d4d5de 0.08472 1.0 0.49% triton_poi_fused__native_batch_norm_legit_functional_add_cat_convolution_backward_native_batch_norm_backward_175_0d1d2d3 0.08458 1.0 0.49% triton_poi_fused_convolution_backward_191_0d1d2d3d4de 0.08078 1.0 0.46% triton_poi_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_83_0d1d2d3d4d5d6d7d8d9d10de 0.08024 1.0 0.46% triton_poi_fused_convolution_backward_193_0d1d2d3d4de 0.07845 1.0 0.45% triton_poi_fused_add_cat_86_0d1d2d3d4d5de 0.06273 1.0 0.36% triton_poi_fused__native_batch_norm_legit_functional_add_cat_convolution_backward_native_batch_norm_backward_87_0d1d2d3d 0.06193 1.0 0.36% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_106_0d1d2d3d4d5d6d7d8de 0.06157 2.0 0.35% triton_poi_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_123_0d1d2d3d4d5d6d7d8d9d10de 0.05024 1.0 0.29% triton_poi_fused_convolution_backward_172_0d1d2de 0.04291 1.0 0.25% triton_poi_fused__native_batch_norm_legit_functional_add_convolution_backward_native_batch_norm_backward_36_0d1d2d3d4d5d 0.04175 1.0 0.24% triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_32_0d1d2d3d4d5d6d7d8d9d10de 0.04099 1.0 0.24% triton_poi_fused__native_batch_norm_legit_functional_add_cat_convolution_backward_native_batch_norm_backward_127_0d1d2d3 0.03901 1.0 0.22% triton_poi_fused_add_cat_126_0d1d2d3d4d5de 0.03888 1.0 0.22% triton_poi_fused_convolution_backward_139_0d1d2d3d4de 0.03883 1.0 0.22% triton_poi_fused_convolution_backward_173_0d1d2de 0.03483 1.0 0.20% triton_poi_fused_convolution_backward_141_0d1d2d3d4de 0.03399 1.0 0.20% triton_poi_fused_convolution_backward_143_0d1d2d3d4de 0.03162 1.0 0.18% triton_poi_fused_convolution_backward_84_0d1d2de 0.02898 1.0 0.17% triton_poi_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_30_0d1d2d3d4d5d6d7d8d9de 0.02803 1.0 0.16% triton_poi_fused_convolution_backward_85_0d1d2de 0.02426 1.0 0.14% triton_poi_fused_convolution_backward_50_0d1d2d3d4de 0.02112 1.0 0.12% triton_poi_fused_convolution_backward_48_0d1d2d3d4de 0.02038 1.0 0.12% triton_poi_fused__native_batch_norm_legit_functional_native_batch_norm_backward_10_0d1d2d3d4d5d6d7d8de 0.01974 1.0 0.11% triton_poi_fused_convolution_backward_124_0d1d2de 0.01818 1.0 0.10% triton_poi_fused__to_copy_11_0d1d2de 0.01765 6.0 0.10% triton_poi_fused_convolution_backward_52_0d1d2d3d4de 0.01612 1.0 0.09% triton_poi_fused_convolution_backward_125_0d1d2de 0.016 1.0 0.09% triton_poi_fused__to_copy_1_0d1d2de 0.01582 1.0 0.09% triton_poi_fused_convolution_backward_54_0d1d2d3d4de 0.01577 1.0 0.09% triton_poi_fused__to_copy_66_0d1d2de 0.0133 6.0 0.08% triton_poi_fused__to_copy_81_0d1d2de 0.01319 6.0 0.08% triton_poi_fused__to_copy_121_0d1d2de 0.01305 6.0 0.07% triton_poi_fused__to_copy_107_0d1d2de 0.01295 6.0 0.07% triton_poi_fused__to_copy_168_0d1d2de 0.01229 6.0 0.07% triton_poi_fused__to_copy_154_0d1d2de 0.01219 6.0 0.07% triton_poi_fused_convolution_backward_34_0d1d2de 0.01015 1.0 0.06% triton_poi_fused__to_copy_28_0d1d2de 0.00964 3.0 0.06% triton_poi_fused_convolution_backward_33_0d1d2de 0.00923 1.0 0.05% triton_poi_fused__to_copy_23_0d1d2 0.00851 3.0 0.05% triton_poi_fused__to_copy_116_0d1d2 0.00829 3.0 0.05% triton_poi_fused__to_copy_79_0d1d2e 0.00816 4.0 0.05% triton_poi_fused__to_copy_76_0d1d2e 0.0081 3.0 0.05% triton_poi_fused_add_fill_mul_sigmoid_sub_41_0d1d2de 0.00801 4.0 0.05% triton_poi_fused__to_copy_14_0d1d2de 0.00766 3.0 0.04% triton_poi_fused__to_copy_17_0d1d2de 0.00762 3.0 0.04% triton_poi_fused__to_copy_24_0d1d2 0.00691 3.0 0.04% triton_poi_fused__to_copy_117_0d1d2 0.00633 3.0 0.04% triton_poi_fused__to_copy_77_0d1d2e 0.00624 3.0 0.04% triton_poi_fused__to_copy_25_0d1d2 0.00623 3.0 0.04% triton_poi_fused__to_copy_164_0d1d2e 0.00619 3.0 0.04% triton_poi_fused__to_copy_78_0d1d2e 0.00611 3.0 0.04% triton_poi_fused__to_copy_118_0d1d2 0.0061 3.0 0.04% triton_poi_fused__to_copy_163_0d1d2e 0.00609 3.0 0.03% triton_poi_fused__to_copy_26_0d1d2 0.00607 3.0 0.03% triton_poi_fused_add_fill_mul_sigmoid_sub_15_0d1d2de 0.00604 3.0 0.03% triton_poi_fused__to_copy_119_0d1d2 0.00603 3.0 0.03% triton_poi_fused__to_copy_69_0d1d2de 0.00602 3.0 0.03% triton_poi_fused__to_copy_70_0d1d2de 0.006 3.0 0.03% triton_poi_fused__to_copy_109_0d1d2de 0.006 3.0 0.03% triton_poi_fused_add_fill_mul_sigmoid_sub_110_0d1d2de 0.006 3.0 0.03% triton_poi_fused__to_copy_112_0d1d2de 0.006 3.0 0.03% triton_poi_fused__to_copy_156_0d1d2de 0.006 3.0 0.03% triton_poi_fused_add_fill_mul_sigmoid_sub_157_0d1d2de 0.006 3.0 0.03% triton_poi_fused__to_copy_159_0d1d2de 0.006 3.0 0.03% triton_poi_fused__to_copy_205_0d1d2de 0.00411 2.0 0.02% triton_poi_fused__to_copy_212_0d1d2de 0.0041 2.0 0.02% triton_poi_fused__to_copy_233_0d1d2de 0.00405 2.0 0.02% triton_poi_fused__to_copy_216_0d1d2de 0.004 2.0 0.02% triton_poi_fused__to_copy_6_0d1d2de 0.00336 1.0 0.02% triton_poi_fused__to_copy_37_0d1d2de 0.00305 1.0 0.02% triton_poi_fused__to_copy_61_0d1d2de 0.003 1.0 0.02% triton_poi_fused__to_copy_88_0d1d2de 0.00299 1.0 0.02% triton_poi_fused__to_copy_188_0d1d2 0.0026 1.0 0.01% triton_poi_fused__to_copy_102_0d1d2de 0.00259 1.0 0.01% triton_poi_fused__to_copy_49_0d1d2de 0.00243 1.0 0.01% triton_poi_fused__to_copy_128_0d1d2de 0.00223 1.0 0.01% triton_poi_fused__to_copy_43_0d1d2de 0.00219 1.0 0.01% triton_poi_fused__to_copy_53_0d1d2de 0.00219 1.0 0.01% triton_poi_fused__to_copy_200_0d1d2de 0.00213 1.0 0.01% triton_poi_fused__to_copy_51_0d1d2de 0.00212 1.0 0.01% triton_poi_fused__to_copy_149_0d1d2de 0.00212 1.0 0.01% triton_poi_fused__to_copy_225_0d1d2de 0.00212 1.0 0.01% triton_poi_fused__to_copy_99_0d1d2de 0.0021 1.0 0.01% triton_poi_fused__to_copy_239_0d1d2de 0.0021 1.0 0.01% triton_poi_fused__to_copy_176_0d1d2de 0.00209 1.0 0.01% triton_poi_fused__to_copy_192_0d1d2 0.00208 1.0 0.01% triton_poi_fused__to_copy_40_0d1d2de 0.00207 1.0 0.01% triton_poi_fused__to_copy_223_0d1d2de 0.00203 1.0 0.01% triton_poi_fused__to_copy_245_0d1d2de 0.00203 1.0 0.01% triton_poi_fused__to_copy_94_0d1d2de 0.00202 1.0 0.01% triton_poi_fused__to_copy_194_0d1d2 0.00202 1.0 0.01% triton_poi_fused__to_copy_140_0d1d2de 0.00201 1.0 0.01% triton_poi_fused__to_copy_190_0d1d2 0.00201 1.0 0.01% triton_poi_fused__to_copy_55_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_91_0d1d2de 0.002 1.0 0.01% triton_poi_fused_add_fill_mul_sigmoid_sub_92_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_131_0d1d2de 0.002 1.0 0.01% triton_poi_fused_add_fill_mul_sigmoid_sub_132_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_134_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_142_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_144_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_179_0d1d2de 0.002 1.0 0.01% triton_poi_fused_add_fill_mul_sigmoid_sub_180_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_182_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_227_0d1d2de 0.002 1.0 0.01% triton_poi_fused__to_copy_242_0d1d2de 0.002 1.0 0.01% Total 38.2341 219.69% == triton_reduction category kernels == Kernel Self CUDA TIME (ms) Count Percent ------------------------------------------------------------------------------------------------------------------------ --------------------- ------- --------- triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_228_0d1d2d3d4d5d6 3.37879 1.0 19.41% triton_red_fused__native_batch_norm_legit_functional_cat_mul_native_batch_norm_backward_165_0d1d2d3d4d5d6d7de8de 1.48738 3.0 8.55% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_161_0d1d2d3d4d5de6de 1.36472 3.0 7.84% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_196_0d1d2d3d4d5de6de 1.31685 1.0 7.57% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_114_0d1d2d3d4d5de6 1.26618 6.0 7.28% triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_206_0d1d2d3d4d5d6 1.08377 1.0 6.23% triton_red_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_threshold_backward_243_0d1d2d3d4 1.06012 1.0 6.09% triton_red_fused_cat_mul_sigmoid_sigmoid_backward_silu_sum_155_0d1d2d3d4d5de6de 1.01631 3.0 5.84% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_72_0d1d2d3d4d5de6de 0.94754 6.0 5.44% triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_218_0d1d2d3d4d5d6 0.88535 1.0 5.09% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_19_0d1d2d3d4d5de6de 0.80786 6.0 4.64% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_210_0d1d2d3d4d5d6de7 0.77106 1.0 4.43% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_threshold_backward_240_0d1d2d3d4d5d6de7 0.71014 1.0 4.08% triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_235_0d1d2d3d4d5d6de7 0.66306 1.0 3.81% triton_red_fused__native_batch_norm_legit_functional_cat_mul_native_batch_norm_backward_145_0d1d2d3d4d5d6d7d8de9de 0.48225 1.0 2.77% triton_red_fused_cat_mul_sigmoid_sigmoid_backward_silu_sum_108_0d1d2d3d4d5de6 0.44265 3.0 2.54% triton_red_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_183_0d1d2d3 0.34281 1.0 1.97% triton_red_fused_cat_mul_sigmoid_sigmoid_backward_silu_sum_67_0d1d2d3d4d5de6 0.33165 3.0 1.91% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_57_0d1d2d3d4d5de6de 0.30184 1.0 1.73% triton_red_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_213_0d1d2d3d4d5d6d7de8 0.29247 1.0 1.68% triton_red_fused_mul_sigmoid_sigmoid_backward_silu_sum_177_0d1d2d3d4de5de 0.22488 1.0 1.29% triton_red_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_95_0d1d2d3d 0.21659 1.0 1.24% triton_red_fused__native_batch_norm_legit_functional_mul_native_batch_norm_backward_100_0d1d2d3d4d5d6de7 0.21525 1.0 1.24% triton_red_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_170_0d1d2d3d4d5d6d7de8de 0.16671 2.0 0.96% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_201_0d1d2d3d4d5de6 0.14289 1.0 0.82% triton_red_fused_mul_sigmoid_sigmoid_backward_silu_sum_89_0d1d2d3d4de5 0.14243 1.0 0.82% triton_red_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_135_0d1d2d3 0.1271 1.0 0.73% triton_red_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_82_0d1d2d3d4d5d6d7de8de 0.11093 2.0 0.64% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_150_0d1d2d3d4d5de6de 0.10621 2.0 0.61% triton_red_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_44_0d1d2d3d 0.07709 1.0 0.44% triton_red_fused_mul_sigmoid_sigmoid_backward_silu_sum_129_0d1d2d3d4de5 0.07671 1.0 0.44% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_62_0d1d2d3d4d5de6de 0.07283 2.0 0.42% triton_red_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_2_0d1d2d3d4d5d6de 0.07264 1.0 0.42% triton_red_fused__native_batch_norm_legit_functional_add_cat_native_batch_norm_backward_122_0d1d2d3d4d5d6d7de8de 0.07209 2.0 0.41% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_103_0d1d2d3d4d5de6de 0.04973 2.0 0.29% triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_35_0d1d2d3d4d5d6d7d8e9de 0.0424 1.0 0.24% triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_31_0d1d2d3d4d5d6d7e8de 0.03426 1.0 0.20% triton_red_fused__native_batch_norm_legit_functional_add_native_batch_norm_backward_29_0d1d2d3d4d5d6e7de 0.02449 1.0 0.14% triton_red_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_97_0d1d2d3d 0.02427 8.0 0.14% triton_red_fused_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_96_0d1d2de3de 0.02412 8.0 0.14% triton_red_fused_cat_mul_native_batch_norm_backward_146_0d1d2de3de 0.0211 7.0 0.12% triton_red_fused__native_batch_norm_legit_functional_cat_mul_native_batch_norm_backward_147_0d1d2d3d4de5de 0.02107 7.0 0.12% triton_red_fused_native_batch_norm_backward_73_0d1d2de3 0.01809 6.0 0.10% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_74_0d1d2d3d4de5 0.01805 6.0 0.10% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_7_0d1d2d3d4d5e6de 0.01802 1.0 0.10% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_152_0d1d2d3d4e5de 0.01213 4.0 0.07% triton_red_fused_native_batch_norm_backward_151_0d1d2e3de 0.01206 4.0 0.07% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_64_0d1d2d3d4de5 0.012 4.0 0.07% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_105_0d1d2d3d4e5 0.01199 4.0 0.07% triton_red_fused_native_batch_norm_backward_63_0d1d2de3 0.00921 4.0 0.05% triton_red_fused_native_batch_norm_backward_104_0d1d2e3 0.00909 4.0 0.05% triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_237_0d1d2d3d4de5de 0.009 3.0 0.05% triton_red_fused_cat_native_batch_norm_backward_236_0d1d2de3de 0.00895 3.0 0.05% triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_208_0d1d2d3d4e5de 0.00797 2.0 0.05% triton_red_fused_cat_native_batch_norm_backward_threshold_backward_207_0d1d2e3de 0.00601 2.0 0.03% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_203_0d1d2d3d4e5de 0.006 2.0 0.03% triton_red_fused_native_batch_norm_backward_202_0d1d2e3de 0.00594 2.0 0.03% triton_red_fused_native_batch_norm_backward_58_0d1d2de3 0.005 1.0 0.03% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_59_0d1d2d3d4de5 0.005 1.0 0.03% triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_230_0d1d2d3d4de5d 0.005 1.0 0.03% triton_red_fused_cat_native_batch_norm_backward_threshold_backward_229_0d1d2de3de 0.00428 1.0 0.02% triton_red_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_185_0d1d2d3 0.00398 1.0 0.02% triton_red_fused__native_batch_norm_legit_functional_cat_native_batch_norm_backward_threshold_backward_220_0d1d2d3d4de5d 0.00308 1.0 0.02% triton_red_fused_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_136_0d1d2de3 0.00301 1.0 0.02% triton_red_fused_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_184_0d1d2de3e 0.00301 1.0 0.02% triton_red_fused__to_copy_sum_0_0d1d2e3de 0.003 1.0 0.02% triton_red_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_137_0d1d2d3 0.003 1.0 0.02% triton_red_fused_native_batch_norm_backward_197_0d1d2de3 0.003 1.0 0.02% triton_red_fused__native_batch_norm_legit_functional_native_batch_norm_backward_198_0d1d2d3d4de5 0.003 1.0 0.02% triton_red_fused_cat_native_batch_norm_backward_threshold_backward_219_0d1d2de3de 0.003 1.0 0.02% Total 21.2305 121.99% == triton_persistent_reduction category kernels == Kernel Self CUDA TIME (ms) Count Percent ------------------------------------------------------------------------------------------------------------------------ --------------------- ------- --------- triton_per_fused_cat_mul_sigmoid_sigmoid_backward_silu_sum_12_0d1d2d3d4d5de6 0.27697 3.0 1.59% triton_per_fused_mul_sigmoid_sigmoid_backward_silu_sum_38_0d1d2d3d4de5 0.05946 1.0 0.34% triton_per_fused_native_batch_norm_backward_20_0d1d2de3 0.01803 6.0 0.10% triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_21_0d1d2d3d4de5 0.01769 6.0 0.10% triton_per_fused__to_copy_convolution_backward_13_0d1d2de3de 0.012 3.0 0.07% triton_per_fused__to_copy_convolution_backward_130_0d1d2de3de 0.01199 4.0 0.07% triton_per_fused__to_copy_convolution_backward_42_0d1d2de3de 0.00827 4.0 0.05% triton_per_fused__native_batch_norm_legit_functional_native_batch_norm_backward_9_0d1d2d3d4e5 0.00815 4.0 0.05% triton_per_fused__to_copy_convolution_backward_90_0d1d2de3de 0.00813 4.0 0.05% triton_per_fused_native_batch_norm_backward_8_0d1d2e3 0.00809 4.0 0.05% triton_per_fused__to_copy_convolution_backward_16_0d1d23de 0.00762 3.0 0.04% triton_per_fused__to_copy_convolution_backward_68_0d1d2de3de 0.00619 3.0 0.04% triton_per_fused__to_copy_convolution_backward_111_0d1d23de 0.00605 3.0 0.03% triton_per_fused__to_copy_convolution_backward_158_0d1d23de 0.006 3.0 0.03% triton_per_fused__to_copy_convolution_backward_39_0d1d2de3de 0.00401 1.0 0.02% triton_per_fused_div_native_batch_norm_backward_threshold_backward_3_0d1d2de3 0.003 1.0 0.02% triton_per_fused__to_copy_convolution_backward_178_0d1d2de3de 0.00298 1.0 0.02% triton_per_fused__native_batch_norm_legit_functional_div_native_batch_norm_backward_threshold_backward_4_0d1d2d3d4de5 0.00291 1.0 0.02% triton_per_fused__native_batch_norm_legit_functional_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_46_0d1d2d3d 0.00213 1.0 0.01% triton_per_fused_add_div_fill_mul_native_batch_norm_backward_sigmoid_sub_45_0d1d2de3 0.00207 1.0 0.01% triton_per_fused__to_copy_convolution_backward_93_0d1d23de 0.00203 1.0 0.01% triton_per_fused__to_copy_convolution_backward_133_0d1d23de 0.002 1.0 0.01% triton_per_fused__to_copy_convolution_backward_181_0d1d23de 0.002 1.0 0.01% Total 0.47777 2.75% == unknown category kernels == Kernel Self CUDA TIME (ms) Count Percent ------------------------------------------------------------------------------------------------------------------------ --------------------- ------- --------- sm80_xmma_wgrad_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage4_warpsize1x1x4_g16_tensor16 19.9738 11.0 114.77% void dgrad2d_c1_k1_nhwc_kernel_specialized_window<__half, float, float, 7, 2, true>(float, cudnnTensorStruct, __half con 19.6497 4.0 112.91% void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(float, cudnnTensorStruct, __half const*, cudnnFilterStruct, _ 9.69776 11.0 55.72% void dgrad2d_c1_k1_nhwc_kernel_specialized_window<__half, float, float, 5, 2, true>(float, cudnnTensorStruct, __half con 7.13334 4.0 40.99% void tensorTransformGeneric<__half, __half, float, true, false, false, (cudnnKernelDataType_t)0>(cudnnTensorTransformStr 6.28859 39.0 36.13% void dgrad2d_c1_k1_nhwc_kernel_specialized_window<__half, float, float, 3, 2, true>(float, cudnnTensorStruct, __half con 2.92674 4.0 16.82% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s16 1.92832 26.0 11.08% sm80_xmma_wgrad_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage4_warpsize2x2x1_g1_tensor16x 1.9164 2.0 11.01% Memset (Device) 1.87908 103.0 10.80% void dgrad2d_c1_k1_nhwc_kernel_specialized_window<__half, float, float, 3, 1, true>(float, cudnnTensorStruct, __half con 1.63006 15.0 9.37% sm80_xmma_gemm_f16f16_f16f32_f32_nn_n_tilesize96x64x32_stage4_warpsize2x2x1_tensor16x8x16_kernel 1.38101 2.0 7.94% void cudnn::cnn::wgrad2d_c1_k1_nhwc_kernel<__half, float, 1, 7, 7, 2, 2, 1, 1, true>(cudnnTensorStruct, __half const*, c 1.3353 4.0 7.67% sm80_xmma_wgrad_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x32x64_stage5_warpsize2x2x1_g1_tensor16x 1.10164 3.0 6.33% void dgrad2d_c1_k1_nhwc_kernel_specialized_window<__half, float, float, 5, 1, true>(float, cudnnTensorStruct, __half con 1.01509 12.0 5.83% void xmma_cudnn::ext::depthwise_convolution::wgrad::kernel(float, cudnnTensorStruct, __half con 0.92755 9.0 5.33% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gemm_f1 0.86364 12.0 4.96% void xmma_cudnn::ext::depthwise_convolution::wgrad::kernel(cudnnTensorStruct, __half const*, c 0.71126 9.0 4.09% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s16 0.65119 10.0 3.74% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gemm_f1 0.61326 12.0 3.52% void cutlass_cudnn::Kernel(cutlass 0.58397 13.0 3.36% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gemm_f1 0.55323 6.0 3.18% sm80_xmma_dgrad_implicit_gemm_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x128x32_stage5_warpsize2x2x1_g1_tensor16x8x16_ex 0.54763 4.0 3.15% void xmma_cudnn::ext::depthwise_convolution::wgrad::kernel(cutlass_80_wmma_tensorop_f16_s16 0.42212 1.0 2.43% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gemm_f1 0.42103 6.0 2.42% void cutlass::Kernel(cutlass_80_wmma_tensorop_s161616gem 0.3949 2.0 2.27% ampere_fp16_s16816gemm_fp16_64x128_ldg8_f2f_stages_64x4_nt 0.37536 3.0 2.16% sm80_xmma_wgrad_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize32x16x64_stage1_warpsize2x1x1_g1_t 0.37321 1.0 2.14% sm80_xmma_gemm_f16f16_f16f32_f32_nn_n_tilesize96x64x32_stage3_warpsize2x2x1_tensor16x8x16_kernel 0.35413 1.0 2.03% sm80_xmma_gemm_f16f16_f16f32_f32_nn_n_tilesize96x128x32_stage4_warpsize2x2x1_tensor16x8x16_kernel 0.32786 6.0 1.88% void cudnn::cnn::wgrad2d_c1_k1_nhwc_kernel<__half, float, 1, 5, 5, 1, 1, 1, 1, true>(cudnnTensorStruct, __half const*, c 0.30981 6.0 1.78% void cutlass::Kernel(cutlass_75_tensorop_f16_s1688gemm_f16_64x64_ 0.27487 2.0 1.58% sm80_xmma_dgrad_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage4_warpsize2x2x1_g1_tensor16x 0.27266 2.0 1.57% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gemm_f16 0.27198 2.0 1.56% ampere_fp16_s16816gemm_fp16_128x128_ldg8_f2f_stages_64x3_nt 0.27086 6.0 1.56% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816ge 0.26192 6.0 1.50% void cudnn::cnn::wgrad2d_c1_k1_nhwc_kernel<__half, float, 1, 3, 3, 1, 1, 1, 1, true>(cudnnTensorStruct, __half const*, c 0.25286 6.0 1.45% _ZN2at6native54_GLOBAL__N__d8ceb000_21_DistributionNormal_cu_0c5b6e8543distribution_elementwise_grid_stride_kernelIfLi4E 0.24742 4.35 1.42% ampere_fp16_s1688gemm_fp16_128x128_ldg8_f2f_stages_32x1_nn 0.24647 2.0 1.42% sm80_xmma_wgrad_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x128x32_stage5_warpsize2x2x1_g1_tensor16 0.2274 2.0 1.31% void splitKreduce_kernel<32, 16, int, __half, __half, float, __half, true, false, false>(cublasSplitKParams, __ha 0.22268 54.0 1.28% void cudnn::cnn::wgrad2d_c1_k1_nhwc_kernel<__half, float, 1, 5, 5, 2, 2, 1, 1, true>(cudnnTensorStruct, __half const*, c 0.22122 1.0 1.27% void cudnn::cnn::wgrad2d_c1_k1_nhwc_kernel<__half, float, 1, 3, 3, 2, 2, 1, 1, true>(cudnnTensorStruct, __half const*, c 0.18944 1.0 1.09% void cudnn::cnn::wgrad2d_c1_k1_nhwc_reduction_kernel<__half, float, 1, 7, 7>(cudnnTensorStruct, cudnnConvolutionStruct, 0.18237 13.0 1.05% ampere_fp16_s16816gemm_fp16_128x64_ldg8_f2f_stages_64x3_nt 0.13325 1.0 0.77% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816g 0.08404 1.0 0.48% void xmma_cudnn::ext::depthwise_convolution::wgrad::kernel Device) 0.06774 32.0 0.39% void cudnn::cnn::wgrad2d_c1_k1_nhwc_reduction_kernel<__half, float, 1, 5, 5>(cudnnTensorStruct, cudnnConvolutionStruct, 0.06738 7.0 0.39% void xmma_cudnn::ext::depthwise_convolution::wgrad::kernel(int, int, int, int, int, int, int, int, 0.04806 6.0 0.28% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s1 0.04399 8.0 0.25% void cudnn::ops::nhwcToNchwKernel<__half, __half, float, true, false, (cudnnKernelDataType_t)0>(cudnn::ops::nhwc2nchw_pa 0.04297 13.0 0.25% sm80_xmma_wgrad_implicit_gemm_indexed_wo_smem_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize32x32x64_stage1_warpsize2x2x1_g1_t 0.04281 5.0 0.25% void xmma_cudnn::ext::depthwise_convolution::wgrad::kernel(cudnnTensorStruct, cudnnConvolutionStruct, 0.04093 7.0 0.24% void xmma_cudnn::ext::depthwise_convolution::wgrad::kernel(cudnn::ops::nchw2nhwc_pa 0.03787 13.0 0.22% void cutlass::Kernel(cutlass_80_tensorop_f16_s16816gemm_f16 0.03 3.0 0.17% sm80_xmma_gemm_f16f16_f16f32_f32_nn_n_tilesize32x32x64_stage6_warpsize2x2x1_tensor16x8x16_kernel 0.02117 4.0 0.12% void splitKreduce_kernel<32, 16, int, float, __half, float, __half, true, false, false>(cublasSplitKParams, float 0.01796 2.0 0.10% ampere_fp16_s16816gemm_fp16_256x128_ldg8_f2f_stages_32x3_nt 0.017 1.0 0.10% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s16 0.01649 4.0 0.09% ampere_fp16_s16816gemm_fp16_64x64_ldg8_f2f_stages_64x5_nn 0.01514 1.0 0.09% sm80_xmma_wgrad_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x64x64_stage4_warpsize2x2x1_g1_tensor16x 0.01504 3.0 0.09% sm80_xmma_wgrad_implicit_gemm_indexed_f16f16_f16f32_f32_nhwckrsc_nhwc_tilesize64x32x64_stage5_warpsize2x2x1_g1_tensor16x 0.01347 1.0 0.08% void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, at::detail::Array >(int, at:: 0.00902 0.04 0.05% void cutlass::Kernel(cutlass_80_wmma_tensorop_f16_s1 0.008 1.0 0.05% _ZN2at6native54_GLOBAL__N__d8ceb000_21_DistributionNormal_cu_0c5b6e8543distribution_elementwise_grid_stride_kernelIfLi4E 0.00443 1.74 0.03% Total 114.288 656.69% Percent of time when GPU is busy: 1001.12% Total wall time 17.404 ms Output for tabulate: mixnet_l, 219.69%, 121.99%, 2.75%, 0.00%, 656.69%, 1001.12%, 17.404ms ======================================================================================== =================== Profile for individual generated triton kernel =========================== ======================================================================================== $> cp /tmp/torchinductor_ubuntu/3n/c3ngrgycuw4nfpqwzjxewdrpdkke74sa5k2tpslhy2o3e3qu5bmc.py k.py $> python k.py /opt/conda/envs/nllm3/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. _torch_pytree._register_pytree_node( 0.004ms 0.000GB 1.23GB/s $> TORCHINDUCTOR_MAX_AUTOTUNE=1 python k.py /opt/conda/envs/nllm3/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. _torch_pytree._register_pytree_node( 0.004ms 0.000GB 1.21GB/s