/**
* less_slow_sm80.ptx
*
* Micro-kernels for building a performance-first mindset for CUDA-capable
* GPUs using Parallel Thread eXecution (PTX) Intermediate Representation (IR)
* for Ampere-generation Nvidia GPUs with Warp-level MMA (WMMA).
*
* ? You should start at `less_slow.cu` before reading this file.
* ? You should start at `less_slow_sm70.ptx` before reading this file.
* ? Also read intro to PTX: https://docs.nvidia.com/cuda/parallel-thread-execution/
* ? Check the PTX ISA: https://docs.nvidia.com/cuda/pdf/ptx_isa_8.5.pdf
*
* You can validate this file by asking the Nvidia PTX Assembler to compile it
* to `.cubin` for some target architecture:
*
* $ ptxas -o less_slow_sm80_from_ptx.cubin -arch=sm_80 less_slow_sm80.ptx
* $ cuobjdump -sass less_slow_sm80_from_ptx.cubin | grep -i mma
*
* Given how aggressively the loops get unrolled and how many kernels this
* file contains, you may want to deduplicate the matching SASS lines:
*
* $ cuobjdump -sass less_slow_sm80_from_ptx.cubin | grep -i mma | \
*     sed -r 's/\/\*[^*]+\*\///g' | \
*     sed -r 's/^[[:space:]]+//; s/[[:space:]]+$//' | \
*     sort -u
*/
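/**
 * Beyond offline validation, a `.ptx` file like this one can also be loaded
 * at runtime through the CUDA Driver API, which JIT-compiles it for the
 * current device. The snippet below is only a sketch of that flow, not the
 * way `less_slow.cu` necessarily wires things up; error checking is omitted
 * for brevity:
 *
 *    #include <cuda.h>
 *
 *    CUdevice device;
 *    CUcontext context;
 *    CUmodule module;
 *    CUfunction kernel;
 *    cuInit(0);
 *    cuDeviceGet(&device, 0);
 *    cuCtxCreate(&context, 0, device);
 *    cuModuleLoad(&module, "less_slow_sm80.ptx");  // JIT-compile the PTX
 *    cuModuleGetFunction(&kernel, module,
 *                        "tops_f16f16_sm80wmma_16x16x16_loop128_ptx_kernel");
 *    // One block of one warp is enough for these register-only kernels
 *    cuLaunchKernel(kernel, 1, 1, 1, 32, 1, 1, 0, NULL, NULL, NULL);
 *    cuCtxSynchronize();
 */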
.version 7.0 // PTX version 7.0 for Ampere GPUs
.target sm_80 // Target architecture (SM_80 - Ampere GPUs)
.address_size 64 // 64-bit addressing
/**
 * Let's define some global memory buffers, visible on both the device and
 * the host side, to which we can write the multiplication results.
 */
.visible .global .align 8 .f64 dummy_sink_f64[32];
.visible .global .align 4 .s32 dummy_sink_s32[32];
.visible .global .align 4 .f32 dummy_sink_f32[32];
/**
 * Our previous Volta kernel should work just fine here, but we can make it
 * nicer by using the `<>` syntax to define many virtual registers without
 * naming each one explicitly: `.reg .b32 accum<4>;` declares `accum0`
 * through `accum3`. We can also declare registers as `.f16x2` to constrain
 * them to packed half-precision pairs.
 *
 * We also scale up from the Quadpair-level MMA to the Warp-level WMMA,
 * synchronizing more threads to process larger tiles: the PTX docs
 * explicitly warn that the `mma.sync.m8n8k4` shape performs poorly on
 * newer architectures! A rough CUDA C++ counterpart is sketched right
 * after the kernel below, for comparison.
 */
.visible .entry tops_f16f16_sm80wmma_16x16x16_loop128_ptx_kernel()
{
// Accumulator registers used for both input and output of the MMA operation
// https://docs.nvidia.com/cuda/parallel-thread-execution/#parameterized-variable-names
.reg .b32 accum<4>;
// Registers to hold packed 16-bit data for matrix A (8 registers)
.reg .f16x2 matrix_a<8>;
// Registers to hold packed 16-bit data for matrix B (8 registers)
.reg .f16x2 matrix_b<8>;
// General-purpose registers for loop control and constant values
.reg .b32 loop_counter, loop_limit, packed_const;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulators, as registers may contain noise
// https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
mov.f32 accum0, 0.0;
mov.f32 accum1, 0.0;
mov.f32 accum2, 0.0;
mov.f32 accum3, 0.0;
// Initialize constant for packed matrix data (placeholder)
mov.b32 packed_const, 0x00010001;
// Initialize matrix a registers with the packed constant
mov.b32 matrix_a0, packed_const;
mov.b32 matrix_a1, packed_const;
mov.b32 matrix_a2, packed_const;
mov.b32 matrix_a3, packed_const;
mov.b32 matrix_a4, packed_const;
mov.b32 matrix_a5, packed_const;
mov.b32 matrix_a6, packed_const;
mov.b32 matrix_a7, packed_const;
// Initialize matrix b registers with the packed constant
mov.b32 matrix_b0, packed_const;
mov.b32 matrix_b1, packed_const;
mov.b32 matrix_b2, packed_const;
mov.b32 matrix_b3, packed_const;
mov.b32 matrix_b4, packed_const;
mov.b32 matrix_b5, packed_const;
mov.b32 matrix_b6, packed_const;
mov.b32 matrix_b7, packed_const;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
wmma.mma.sync.aligned.row.col.m16n16k16.f16.f16
{ accum0, accum1, accum2, accum3 },
{ matrix_a0, matrix_a1, matrix_a2, matrix_a3,
matrix_a4, matrix_a5, matrix_a6, matrix_a7 },
{ matrix_b0, matrix_b1, matrix_b2, matrix_b3,
matrix_b4, matrix_b5, matrix_b6, matrix_b7 },
{ accum0, accum1, accum2, accum3 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f32 [dummy_sink_f32], accum0;
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
ret;
}
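/**
 * For comparison, here is a rough CUDA C++ counterpart of the kernel above,
 * using the `nvcuda::wmma` intrinsics. It's only a sketch under assumptions:
 * the sink buffer `dummy_sink_f16_cuda` and the kernel name are made up, and
 * the real `less_slow.cu` benchmarks may be structured differently. Launch it
 * with at least one full warp (32 threads); each `wmma::mma_sync` call on a
 * 16x16x16 tile performs 16*16*16 = 4,096 multiply-accumulates, i.e. 8,192
 * FLOPs per warp per iteration.
 *
 *    #include <cuda_fp16.h>
 *    #include <mma.h>
 *    using namespace nvcuda;
 *
 *    __device__ half dummy_sink_f16_cuda[16 * 16]; // hypothetical sink
 *
 *    __global__ void tops_f16f16_wmma_cuda_kernel() {
 *        // One 16x16x16 tile per warp, f16 inputs and f16 accumulators
 *        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
 *        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> b_frag;
 *        wmma::fragment<wmma::accumulator, 16, 16, 16, half> c_frag;
 *        wmma::fill_fragment(a_frag, __float2half(1.0f));
 *        wmma::fill_fragment(b_frag, __float2half(1.0f));
 *        wmma::fill_fragment(c_frag, __float2half(0.0f));
 *        for (int i = 0; i != 128; ++i)
 *            wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
 *        // Dump the tile so the compiler can't optimize the loop away
 *        wmma::store_matrix_sync(dummy_sink_f16_cuda, c_frag, 16, wmma::mem_row_major);
 *    }
 */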
.visible .entry tops_f16f32_sm80wmma_16x16x16_loop128_ptx_kernel()
{
// Accumulator registers used for both input and output of the MMA operation
// https://docs.nvidia.com/cuda/parallel-thread-execution/#parameterized-variable-names
.reg .b32 accum<8>;
// Registers to hold packed 16-bit data for matrix A (8 registers)
.reg .f16x2 matrix_a<8>;
// Registers to hold packed 16-bit data for matrix B (8 registers)
.reg .f16x2 matrix_b<8>;
// General-purpose registers for loop control and constant values
.reg .b32 loop_counter, loop_limit, packed_const;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulators, as registers may contain noise
// https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
mov.f32 accum0, 0.0;
mov.f32 accum1, 0.0;
mov.f32 accum2, 0.0;
mov.f32 accum3, 0.0;
mov.f32 accum4, 0.0;
mov.f32 accum5, 0.0;
mov.f32 accum6, 0.0;
mov.f32 accum7, 0.0;
// Initialize constant for packed matrix data (placeholder)
mov.b32 packed_const, 0x00010001;
// Initialize matrix a registers with the packed constant
mov.b32 matrix_a0, packed_const;
mov.b32 matrix_a1, packed_const;
mov.b32 matrix_a2, packed_const;
mov.b32 matrix_a3, packed_const;
mov.b32 matrix_a4, packed_const;
mov.b32 matrix_a5, packed_const;
mov.b32 matrix_a6, packed_const;
mov.b32 matrix_a7, packed_const;
// Initialize matrix b registers with the packed constant
mov.b32 matrix_b0, packed_const;
mov.b32 matrix_b1, packed_const;
mov.b32 matrix_b2, packed_const;
mov.b32 matrix_b3, packed_const;
mov.b32 matrix_b4, packed_const;
mov.b32 matrix_b5, packed_const;
mov.b32 matrix_b6, packed_const;
mov.b32 matrix_b7, packed_const;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
wmma.mma.sync.aligned.row.col.m16n16k16.f32.f32
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 },
{ matrix_a0, matrix_a1, matrix_a2, matrix_a3,
matrix_a4, matrix_a5, matrix_a6, matrix_a7 },
{ matrix_b0, matrix_b1, matrix_b2, matrix_b3,
matrix_b4, matrix_b5, matrix_b6, matrix_b7 },
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f32 [dummy_sink_f32], accum0;
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
ret;
}
/**
 * Each new generation of Tensor Cores supports a wider palette of numeric
 * types, "structured sparsity" modes, and asynchronous scheduling protocols.
 *
 * ! For double-precision numbers, the smallest granularity is 8x8x4.
 * ! Technically it requires SM 8.0, but it's not available as a Warp-level
 * ! WMMA operation, only as the lower-level `mma.sync` instruction, the
 * ! same family as the Quadpair-level MMA from the Volta file!
 */
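/**
 * A quick back-of-the-envelope check of what the kernel below does, assuming
 * one `mma.sync.m8n8k4.f64` per iteration: each instruction performs
 * 8 x 8 x 4 = 256 multiply-accumulates, i.e. 512 FLOPs, so 128 iterations
 * amount to 65,536 double-precision FLOPs per warp per kernel launch. A
 * host-side benchmark can divide that by the measured time to estimate the
 * achieved throughput.
 */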
.visible .entry tops_f64f64_sm80mma_8x8x4_loop128_ptx_kernel()
{
// Registers to hold matrix A and B operands (each a single f64)
.reg .f64 matrix_a, matrix_b;
// Additive bias and accumulator registers used for both input and output of the MMA operation
.reg .f64 bias<2>, accum<2>;
// General-purpose registers for loop control
.reg .b32 loop_counter, loop_limit;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulator registers
mov.f64 accum0, 0.0;
mov.f64 accum1, 0.0;
// Initialize matrix A and B operands to one (double precision)
mov.f64 matrix_a, 1.0;
mov.f64 matrix_b, 1.0;
// Initialize the additive bias to zero
mov.f64 bias0, 0.0;
mov.f64 bias1, 0.0;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64
{ accum0, accum1 },
{ matrix_a },
{ matrix_b },
{ bias0, bias1 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f64 [dummy_sink_f64], accum0;
st.global.volatile.f64 [dummy_sink_f64+8], accum1;
ret;
}
/**
* Ridiculously, there are no Warp-Group-Asynchronous double-precision
* variants, but every other variant is present:
*
* - `b1` operands with `s32` accumulators
* - `u8` and `s8` operands with `s32` accumulators
* - `e4m3` and `e5m2` operands with `f16` and `f32` accumulators
* - `f16` operands with `f16` and `f32` accumulators
* - `bf16` operands with `f32` accumulators
* - `tf32` operands with `f32` accumulators
*
* So the highest-precision properly accelerated type is the `tf32`, which
* is confusingly 19 bits wide! The synchronous variant would look familiar:
*/
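/**
 * As a sanity check on that "19 bits" claim: `tf32` keeps the 8-bit exponent
 * of `f32` and the 10-bit mantissa of `f16`, so 1 sign + 8 exponent + 10
 * mantissa = 19 bits, stored in a 32-bit container. In PTX, an `f32` value
 * can be rounded into that format with `cvt.rna.tf32.f32` before feeding it
 * to the Tensor Cores; the kernel below skips that step and simply treats a
 * placeholder bit pattern as already-converted `tf32` data.
 */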
.visible .entry tops_tf32f32_sm80wmma_16x16x8_loop128_ptx_kernel()
{
// Accumulator registers used for both input and output of the MMA operation
.reg .b32 accum<8>;
// Registers to hold `tf32` data for matrix A (4 registers, one value each)
.reg .b32 matrix_a<4>;
// Registers to hold `tf32` data for matrix B (4 registers, one value each)
.reg .b32 matrix_b<4>;
// General-purpose registers for loop control and constant values
.reg .b32 loop_counter, loop_limit, packed_const;
// Predicate register for conditional branching (loop exit)
.reg .pred exit_predicate;
// Set up loop counter and loop limit
mov.u32 loop_counter, 0;
mov.u32 loop_limit, 128;
// Zero-initialize the accumulator registers
mov.f32 accum0, 0.0;
mov.f32 accum1, 0.0;
mov.f32 accum2, 0.0;
mov.f32 accum3, 0.0;
mov.f32 accum4, 0.0;
mov.f32 accum5, 0.0;
mov.f32 accum6, 0.0;
mov.f32 accum7, 0.0;
// Initialize a placeholder bit pattern for the matrix data
mov.b32 packed_const, 0x00010001;
// Initialize all four matrix a registers with the placeholder constant
mov.b32 matrix_a0, packed_const;
mov.b32 matrix_a1, packed_const;
mov.b32 matrix_a2, packed_const;
mov.b32 matrix_a3, packed_const;
// Initialize all four matrix b registers with the placeholder constant
mov.b32 matrix_b0, packed_const;
mov.b32 matrix_b1, packed_const;
mov.b32 matrix_b2, packed_const;
mov.b32 matrix_b3, packed_const;
// The main loop will repeat for 128 iterations
loop_start:
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
@exit_predicate bra loop_exit;
wmma.mma.sync.aligned.m16n16k8.row.col.f32.tf32.tf32.f32
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 },
{ matrix_a0, matrix_a1, matrix_a2, matrix_a3 },
{ matrix_b0, matrix_b1, matrix_b2, matrix_b3 },
{ accum0, accum1, accum2, accum3,
accum4, accum5, accum6, accum7 };
// Increment the loop counter
add.u32 loop_counter, loop_counter, 1;
// Branch back to the beginning of the loop
bra loop_start;
loop_exit:
// A block-wide barrier: make sure every warp has finished its MMA loop.
bar.sync 0;
// Use volatile stores to force the accumulator values to be written out.
// This dummy write (to a global variable) makes the work observable and
// prevents the multiplication pipeline from being optimized out.
st.global.volatile.f32 [dummy_sink_f32], accum0;
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
ret;
}