/**
* less_slow_sm80.ptx
*
* Micro-kernels for building a performance-first mindset for CUDA-capable
* GPUs using Parallel Thread eXecution (PTX) Intermediate Representation (IR)
* for Ampere-generation Nvidia GPUs with Warp-level MMA (WMMA).
*
* ? You should start at `less_slow.cu` before reading this file.
* ? You should start at `less_slow_sm70.ptx` before reading this file.
* ? Also read intro to PTX: https://docs.nvidia.com/cuda/parallel-thread-execution/
* ? Check the PTX ISA: https://docs.nvidia.com/cuda/pdf/ptx_isa_8.5.pdf
*
* You can validate this file by asking the Nvidia PTX Assembler to compile it
* to `.cubin` for some target architecture:
*
* $ ptxas -o less_slow_sm80_from_ptx.cubin -arch=sm_80 less_slow_sm80.ptx
* $ cuobjdump -sass less_slow_sm80_from_ptx.cubin | grep -i mma
*
* Given how aggressively the loops get unrolled and how many kernels this
* file contains, you may want to deduplicate the matching SASS lines:
*
* $ cuobjdump -sass less_slow_sm80_from_ptx.cubin | grep -i mma | \
*     sed -r 's/\/\*[^*]+\*\///g' | \
*     sed -r 's/^[[:space:]]+//; s/[[:space:]]+$//' | \
*     sort -u
*/
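/**
 * Beyond offline validation, a `.ptx` file like this one can also be loaded
 * at runtime through the CUDA Driver API, which JIT-compiles it for the
 * current device. The snippet below is only a sketch of that flow, not the
 * way `less_slow.cu` necessarily wires things up; error checking is omitted
 * for brevity:
 *
 *    #include <cuda.h>
 *
 *    CUdevice device;
 *    CUcontext context;
 *    CUmodule module;
 *    CUfunction kernel;
 *    cuInit(0);
 *    cuDeviceGet(&device, 0);
 *    cuCtxCreate(&context, 0, device);
 *    cuModuleLoad(&module, "less_slow_sm80.ptx");  // JIT-compile the PTX
 *    cuModuleGetFunction(&kernel, module,
 *                        "tops_f16f16_sm80wmma_16x16x16_loop128_ptx_kernel");
 *    // One block of one warp is enough for these register-only kernels
 *    cuLaunchKernel(kernel, 1, 1, 1, 32, 1, 1, 0, NULL, NULL, NULL);
 *    cuCtxSynchronize();
 */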
.version 7.0 // PTX version 7.0 for Ampere GPUs
.target sm_80 // Target architecture (SM_80 - Ampere GPUs)
.address_size 64 // 64-bit addressing
/**
 * Let's define some global memory buffers, visible on both the device and
 * the host side, to which we can write the multiplication results.
 */
.visible .global .align 8 .f64 dummy_sink_f64[32];
.visible .global .align 4 .s32 dummy_sink_s32[32];
.visible .global .align 4 .f32 dummy_sink_f32[32];
/**
 * Our previous Volta kernel should work just fine here, but we can make it
 * nicer by using the `<>` syntax to define many virtual registers without
 * naming each one explicitly: `.reg .b32 accum<4>;` declares `accum0`
 * through `accum3`. We can also declare registers as `.f16x2` to constrain
 * them to packed half-precision pairs.
 *
 * We also scale up from the Quadpair-level MMA to the Warp-level WMMA,
 * synchronizing more threads to process larger tiles: the PTX docs
 * explicitly warn that the `mma.sync.m8n8k4` shape performs poorly on
 * newer architectures! A rough CUDA C++ counterpart is sketched right
 * after the kernel below, for comparison.
 */
.visible .entry tops_f16f16_sm80wmma_16x16x16_loop128_ptx_kernel()
{
// Accumulator registers used for both input and output of the MMA operation
// https://docs.nvidia.com/cuda/parallel-thread-execution/#parameterized-variable-names
.reg .b32 accum<4>;
// Registers to hold packed 16-bit data for matrix A (8 registers)
.reg .f16x2 matrix_a<8>;
// Registers to hold packed 16-bit data for matrix B (8 registers)
.reg .f16x2 matrix_b<8>;
// General-purpose registers for loop control and constant values
.reg .b32 loop_counter, loop_limit, packed_const;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulators, as registers may contain noise
// https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
mov.f32 accum0, 0.0;
mov.f32 accum1, 0.0;
mov.f32 accum2, 0.0;
mov.f32 accum3, 0.0;
// Initialize constant for packed matrix data (placeholder)
mov.b32 packed_const, 0x00010001;
// Initialize matrix a registers with the packed constant
mov.b32 matrix_a0, packed_const;
mov.b32 matrix_a1, packed_const;
mov.b32 matrix_a2, packed_const;
mov.b32 matrix_a3, packed_const;
mov.b32 matrix_a4, packed_const;
mov.b32 matrix_a5, packed_const;
mov.b32 matrix_a6, packed_const;
mov.b32 matrix_a7, packed_const;
// Initialize matrix b registers with the packed constant
mov.b32 matrix_b0, packed_const;
mov.b32 matrix_b1, packed_const;
mov.b32 matrix_b2, packed_const;
mov.b32 matrix_b3, packed_const;
mov.b32 matrix_b4, packed_const;
mov.b32 matrix_b5, packed_const;
mov.b32 matrix_b6, packed_const;
mov.b32 matrix_b7, packed_const;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16
{ accum0, accum1, accum2, accum3 },
{ matrix_a0, matrix_a1, matrix_a2, matrix_a3,
matrix_a4, matrix_a5, matrix_a6, matrix_a7 },
{ matrix_b0, matrix_b1, matrix_b2, matrix_b3,
matrix_b4, matrix_b5, matrix_b6, matrix_b7 },
{ accum0, accum1, accum2, accum3 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f32 [dummy_sink_f32], accum0;
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
ret;
}
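/**
 * For comparison, here is a rough CUDA C++ counterpart of the kernel above,
 * using the `nvcuda::wmma` intrinsics. It's only a sketch under assumptions:
 * the sink buffer `dummy_sink_f16_cuda` and the kernel name are made up, and
 * the real `less_slow.cu` benchmarks may be structured differently. Launch it
 * with at least one full warp (32 threads); each `wmma::mma_sync` call on a
 * 16x16x16 tile performs 16*16*16 = 4,096 multiply-accumulates, i.e. 8,192
 * FLOPs per warp per iteration.
 *
 *    #include <cuda_fp16.h>
 *    #include <mma.h>
 *    using namespace nvcuda;
 *
 *    __device__ half dummy_sink_f16_cuda[16 * 16]; // hypothetical sink
 *
 *    __global__ void tops_f16f16_wmma_cuda_kernel() {
 *        // One 16x16x16 tile per warp, f16 inputs and f16 accumulators
 *        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
 *        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
 *        wmma::fragment<wmma::accumulator, 16, 16, 16, half> c_frag;
 *        wmma::fill_fragment(a_frag, __float2half(1.0f));
 *        wmma::fill_fragment(b_frag, __float2half(1.0f));
 *        wmma::fill_fragment(c_frag, __float2half(0.0f));
 *        for (int i = 0; i != 128; ++i)
 *            wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
 *        // Dump the tile so the compiler can't optimize the loop away
 *        wmma::store_matrix_sync(dummy_sink_f16_cuda, c_frag, 16, wmma::mem_row_major);
 *    }
 */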
.visible .entry tops_f16f32_sm80wmma_16x16x16_loop128_ptx_kernel()
{
// Accumulator registers used for both input and output of the MMA operation
// https://docs.nvidia.com/cuda/parallel-thread-execution/#parameterized-variable-names
.reg .b32 accum<8>;
// Registers to hold packed 16-bit data for matrix A (8 registers)
.reg .f16x2 matrix_a<8>;
// Registers to hold packed 16-bit data for matrix B (8 registers)
.reg .f16x2 matrix_b<8>;
// General-purpose registers for loop control and constant values
.reg .b32 loop_counter, loop_limit, packed_const;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulators, as registers may contain noise
// https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
mov.f32 accum0, 0.0;
mov.f32 accum1, 0.0;
mov.f32 accum2, 0.0;
mov.f32 accum3, 0.0;
mov.f32 accum4, 0.0;
mov.f32 accum5, 0.0;
mov.f32 accum6, 0.0;
mov.f32 accum7, 0.0;
// Initialize constant for packed matrix data (placeholder)
mov.b32 packed_const, 0x00010001;
// Initialize matrix a registers with the packed constant
mov.b32 matrix_a0, packed_const;
mov.b32 matrix_a1, packed_const;
mov.b32 matrix_a2, packed_const;
mov.b32 matrix_a3, packed_const;
mov.b32 matrix_a4, packed_const;
mov.b32 matrix_a5, packed_const;
mov.b32 matrix_a6, packed_const;
mov.b32 matrix_a7, packed_const;
// Initialize matrix b registers with the packed constant
mov.b32 matrix_b0, packed_const;
mov.b32 matrix_b1, packed_const;
mov.b32 matrix_b2, packed_const;
mov.b32 matrix_b3, packed_const;
mov.b32 matrix_b4, packed_const;
mov.b32 matrix_b5, packed_const;
mov.b32 matrix_b6, packed_const;
mov.b32 matrix_b7, packed_const;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 },
{ matrix_a0, matrix_a1, matrix_a2, matrix_a3,
matrix_a4, matrix_a5, matrix_a6, matrix_a7 },
{ matrix_b0, matrix_b1, matrix_b2, matrix_b3,
matrix_b4, matrix_b5, matrix_b6, matrix_b7 },
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f32 [dummy_sink_f32], accum0;
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
ret;
}
/**
 * Each new generation of Tensor Cores supports a wider palette of numeric
 * types, "structured sparsity" modes, and asynchronous scheduling protocols.
 *
 * ! For double-precision numbers, the smallest granularity is 8x8x4.
 * ! Technically it requires SM 8.0, but it's not available as a Warp-level
 * ! WMMA operation, only as the lower-level `mma.sync` instruction, the
 * ! same family as the Quadpair-level MMA from the Volta file!
 */
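/**
 * A quick back-of-the-envelope check of what the kernel below does, assuming
 * one `mma.sync.m8n8k4.f64` per iteration: each instruction performs
 * 8 x 8 x 4 = 256 multiply-accumulates, i.e. 512 FLOPs, so 128 iterations
 * amount to 65,536 double-precision FLOPs per warp per kernel launch. A
 * host-side benchmark can divide that by the measured time to estimate the
 * achieved throughput.
 */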
.visible .entry tops_f64f64_sm80mma_8x8x4_loop128_ptx_kernel()
{
// Registers to hold matrix A and B operands (each a single f64)
.reg .f64 matrix_a, matrix_b;
// Additive bias and accumulator registers used for both input and output of the MMA operation
.reg .f64 bias<2>, accum<2>;
// General-purpose registers for loop control
.reg .b32 loop_counter, loop_limit;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulator registers
mov.f64 accum0, 0.0;
mov.f64 accum1, 0.0;
// Initialize matrix A and B operands to one (double precision)
mov.f64 matrix_a, 1.0;
mov.f64 matrix_b, 1.0;
// Initialize the additive bias to zero
mov.f64 bias0, 0.0;
mov.f64 bias1, 0.0;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64
{ accum0, accum1 },
{ matrix_a },
{ matrix_b },
{ bias0, bias1 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f64 [dummy_sink_f64], accum0;
st.global.volatile.f64 [dummy_sink_f64+8], accum1;
ret;
}
/**
* Ridiculously, there are no Warp-Group-Asynchronous double-precision
* variants, but every other variant is present:
*
* - `b1` operands with `s32` accumulators
* - `u8` and `s8` operands with `s32` accumulators
* - `e4m3` and `e5m2` operands with `f16` and `f32` accumulators
* - `f16` operands with `f16` and `f32` accumulators
* - `bf16` operands with `f32` accumulators
* - `tf32` operands with `f32` accumulators
*
* So the highest-precision properly accelerated type is the `tf32`, which
* is confusingly 19 bits wide! The synchronous variant would look familiar:
*/
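/**
 * As a sanity check on that "19 bits" claim: `tf32` keeps the 8-bit exponent
 * of `f32` and the 10-bit mantissa of `f16`, so 1 sign + 8 exponent + 10
 * mantissa = 19 bits, stored in a 32-bit container. In PTX, an `f32` value
 * can be rounded into that format with `cvt.rna.tf32.f32` before feeding it
 * to the Tensor Cores; the kernel below skips that step and simply treats a
 * placeholder bit pattern as already-converted `tf32` data.
 */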
.visible .entry tops_tf32f32_sm80wmma_16x16x8_loop128_ptx_kernel()
{
// Accumulator registers used for both input and output of the MMA operation
.reg .b32 accum<8>;
// Registers to hold `tf32` data for matrix A (4 registers, one value each)
.reg .b32 matrix_a<4>;
// Registers to hold `tf32` data for matrix B (4 registers, one value each)
.reg .b32 matrix_b<4>;
// General-purpose registers for loop control and constant values
.reg .b32 loop_counter, loop_limit, packed_const;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulator registers
mov.f32 accum0, 0.0;
mov.f32 accum1, 0.0;
mov.f32 accum2, 0.0;
mov.f32 accum3, 0.0;
mov.f32 accum4, 0.0;
mov.f32 accum5, 0.0;
mov.f32 accum6, 0.0;
mov.f32 accum7, 0.0;
// Initialize a placeholder bit pattern for the matrix data
mov.b32 packed_const, 0x00010001;
// Initialize all four matrix a registers with the placeholder constant
mov.b32 matrix_a0, packed_const;
mov.b32 matrix_a1, packed_const;
mov.b32 matrix_a2, packed_const;
mov.b32 matrix_a3, packed_const;
// Initialize all four matrix b registers with the placeholder constant
mov.b32 matrix_b0, packed_const;
mov.b32 matrix_b1, packed_const;
mov.b32 matrix_b2, packed_const;
mov.b32 matrix_b3, packed_const;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
wmma.mma.sync.aligned.m16n16k8.row.col.f32.tf32.tf32.f32
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 },
{ matrix_a0, matrix_a1, matrix_a2, matrix_a3 },
{ matrix_b0, matrix_b1, matrix_b2, matrix_b3 },
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f32 [dummy_sink_f32], accum0;
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
ret;
}