|
106 | 106 |
|
107 | 107 | ######################################################################
|
108 | 108 | # The output will look like (omitting some columns):
|
109 |
| - |
110 |
| -# --------------------------------- ------------ ------------ ------------ ------------ |
111 |
| -# Name Self CPU CPU total CPU time avg # of Calls |
112 |
| -# --------------------------------- ------------ ------------ ------------ ------------ |
113 |
| -# model_inference 5.509ms 57.503ms 57.503ms 1 |
114 |
| -# aten::conv2d 231.000us 31.931ms 1.597ms 20 |
115 |
| -# aten::convolution 250.000us 31.700ms 1.585ms 20 |
116 |
| -# aten::_convolution 336.000us 31.450ms 1.573ms 20 |
117 |
| -# aten::mkldnn_convolution 30.838ms 31.114ms 1.556ms 20 |
118 |
| -# aten::batch_norm 211.000us 14.693ms 734.650us 20 |
119 |
| -# aten::_batch_norm_impl_index 319.000us 14.482ms 724.100us 20 |
120 |
| -# aten::native_batch_norm 9.229ms 14.109ms 705.450us 20 |
121 |
| -# aten::mean 332.000us 2.631ms 125.286us 21 |
122 |
| -# aten::select 1.668ms 2.292ms 8.988us 255 |
123 |
| -# --------------------------------- ------------ ------------ ------------ ------------ |
124 |
| -# Self CPU time total: 57.549m |
| 109 | +# |
| 110 | +# .. code-block:: sh |
| 111 | +# |
| 112 | +# --------------------------------- ------------ ------------ ------------ ------------ |
| 113 | +# Name Self CPU CPU total CPU time avg # of Calls |
| 114 | +# --------------------------------- ------------ ------------ ------------ ------------ |
| 115 | +# model_inference 5.509ms 57.503ms 57.503ms 1 |
| 116 | +# aten::conv2d 231.000us 31.931ms 1.597ms 20 |
| 117 | +# aten::convolution 250.000us 31.700ms 1.585ms 20 |
| 118 | +# aten::_convolution 336.000us 31.450ms 1.573ms 20 |
| 119 | +# aten::mkldnn_convolution 30.838ms 31.114ms 1.556ms 20 |
| 120 | +# aten::batch_norm 211.000us 14.693ms 734.650us 20 |
| 121 | +# aten::_batch_norm_impl_index 319.000us 14.482ms 724.100us 20 |
| 122 | +# aten::native_batch_norm 9.229ms 14.109ms 705.450us 20 |
| 123 | +# aten::mean 332.000us 2.631ms 125.286us 21 |
| 124 | +# aten::select 1.668ms 2.292ms 8.988us 255 |
| 125 | +# --------------------------------- ------------ ------------ ------------ ------------ |
| 126 | +# Self CPU time total: 57.549ms |
125 | 127 | #
|
126 | 128 |
|
127 | 129 | ######################################################################
|
|
210 | 212 | # Self CPU time total: 23.015ms
|
211 | 213 | # Self CUDA time total: 11.666ms
|
212 | 214 | #
|
213 |
| -###################################################################### |
214 |
| - |
215 | 215 |
|
216 | 216 | ######################################################################
|
217 | 217 | # (Note: the first use of XPU profiling may incur extra overhead.)
|
|
221 | 221 | #
|
222 | 222 | # .. code-block:: sh
|
223 | 223 | #
|
224 |
| -#------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ |
225 |
| -# Name Self XPU Self XPU % XPU total XPU time avg # of Calls |
226 |
| -# ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ |
227 |
| -# model_inference 0.000us 0.00% 2.567ms 2.567ms 1 |
228 |
| -# aten::conv2d 0.000us 0.00% 1.871ms 93.560us 20 |
229 |
| -# aten::convolution 0.000us 0.00% 1.871ms 93.560us 20 |
230 |
| -# aten::_convolution 0.000us 0.00% 1.871ms 93.560us 20 |
231 |
| -# aten::convolution_overrideable 1.871ms 72.89% 1.871ms 93.560us 20 |
232 |
| -# gen_conv 1.484ms 57.82% 1.484ms 74.216us 20 |
233 |
| -# aten::batch_norm 0.000us 0.00% 432.640us 21.632us 20 |
234 |
| -# aten::_batch_norm_impl_index 0.000us 0.00% 432.640us 21.632us 20 |
235 |
| -# aten::native_batch_norm 432.640us 16.85% 432.640us 21.632us 20 |
236 |
| -# conv_reorder 386.880us 15.07% 386.880us 6.448us 60 |
237 |
| -# ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ |
238 |
| -# Self CPU time total: 712.486ms |
239 |
| -# Self XPU time total: 2.567ms |
240 |
| - |
| 224 | +# ------------------------------ ------------ ------------ ------------ ------------ ------------ |
| 225 | +# Name Self XPU Self XPU % XPU total XPU time avg # of Calls |
| 226 | +# ------------------------------ ------------ ------------ ------------ ------------ ------------ |
| 227 | +# model_inference 0.000us 0.00% 2.567ms 2.567ms 1 |
| 228 | +# aten::conv2d 0.000us 0.00% 1.871ms 93.560us 20 |
| 229 | +# aten::convolution 0.000us 0.00% 1.871ms 93.560us 20 |
| 230 | +# aten::_convolution 0.000us 0.00% 1.871ms 93.560us 20 |
| 231 | +# aten::convolution_overrideable 1.871ms 72.89% 1.871ms 93.560us 20 |
| 232 | +# gen_conv 1.484ms 57.82% 1.484ms 74.216us 20 |
| 233 | +# aten::batch_norm 0.000us 0.00% 432.640us 21.632us 20 |
| 234 | +# aten::_batch_norm_impl_index 0.000us 0.00% 432.640us 21.632us 20 |
| 235 | +# aten::native_batch_norm 432.640us 16.85% 432.640us 21.632us 20 |
| 236 | +# conv_reorder 386.880us 15.07% 386.880us 6.448us 60 |
| 237 | +# ------------------------------ ------------ ------------ ------------ ------------ ------------ |
| 238 | +# Self CPU time total: 712.486ms |
| 239 | +# Self XPU time total: 2.567ms |
241 | 240 | #
|
242 | 241 |
|
243 |
| - |
244 | 242 | ######################################################################
|
245 |
| -# Note the occurrence of on-device kernels in the output (e.g. ``sgemm_32x32x32_NN``). |
| 243 | +# Note the occurrence of on-device kernels in the output (e.g. ``sgemm_32x32x32_NN`` for CUDA or ``gen_conv`` for XPU). |
246 | 244 |
|
247 | 245 | ######################################################################
|
248 | 246 | # 4. Using profiler to analyze memory consumption
|
|
0 commit comments