/**
 * less_slow_sm70.ptx
 *
 * Micro-kernels for building a performance-first mindset for CUDA-capable
 * GPUs using Parallel Thread eXecution (PTX) Intermediate Representation (IR)
 * for the Streaming Multiprocessors (SMs) and Tensor Cores (TCs) of
 * Volta-generation Nvidia GPUs.
 *
 * ? You should start at `less_slow.cu` before reading this file.
 * ? Also read the intro to PTX: https://docs.nvidia.com/cuda/parallel-thread-execution/
 * ? Check the PTX ISA: https://docs.nvidia.com/cuda/pdf/ptx_isa_8.5.pdf
 *
 * ! PTX is higher-level than SASS, but still very similar to typical CPU
 * ! assembly languages. It has slightly different syntax and semantics for
 * ! predicates and memory access, but still doesn't have `for` loops :(
 * ! As for emojis, we can only use ASCII... CUDA can't JIT-compile UTF-8.
 *
 * You can validate this file by asking the Nvidia PTX Assembler to compile it
 * to `.cubin` for some target architecture:
 *
 * $ ptxas -o less_slow_sm70_from_ptx.cubin -arch=sm_70 less_slow_sm70.ptx
 * $ cuobjdump -sass less_slow_sm70_from_ptx.cubin | grep -i mma
 *
 * Given how aggressively NVCC unrolls loops and the number of kernels in
 * this file, you may want to deduplicate the dumped instructions:
 *
 * $ cuobjdump -sass less_slow_sm70_from_ptx.cubin | grep -i mma | \
 * $     sed -r 's/\/\*[^*]+\*\///g' | \
 * $     sed -r 's/^[[:space:]]+//; s/[[:space:]]+$//' | \
 * $     sort -u
 *
 * @section Register File
 *
 * GPU code manages registers very differently from CPU code. The Hopper
 * tuning guide states that the register file holds 64K 32-bit registers per
 * Streaming Multiprocessor, and a single thread can use at most 255 of them.
 *
 * PTX provides read-only variables visible as special registers, like `%tid`
 * for the thread ID, `%ctaid` for the block ID, `%aggr_smem_size` for the
 * aggregate shared memory size, or `%current_graph_exec` for the ID of the
 * currently executing device graph. To read from them, simply use the `mov`
 * instruction.
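 *
 * For example, here is a minimal sketch (register names are illustrative)
 * that combines three of those special registers into a global thread index,
 * assuming a 1-D grid of 1-D blocks:
 *
 *     .reg .b32 thread_in_block, block_in_grid, threads_per_block, global_id;
 *     mov.u32 thread_in_block, %tid.x;      // Thread index within the block
 *     mov.u32 block_in_grid, %ctaid.x;      // Block index within the grid
 *     mov.u32 threads_per_block, %ntid.x;   // Block size along `x`
 *     mad.lo.u32 global_id, block_in_grid, threads_per_block, thread_in_block;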
 */

.version 6.5     // PTX version 6.5 is enough for Volta GPUs
.target sm_70    // Target architecture (SM 7.0 - Volta GPUs)
.address_size 64 // 64-bit addressing

.visible .entry tops_f16f16_sm70mma_8x8x4_loop128_ptx_kernel()
{
    // Accumulator registers used for both input and output of the MMA operation
    .reg .b32 accum_0, accum_1, accum_2, accum_3;
    // Registers to hold packed pairs of 16-bit data for matrix a (2 registers)
    .reg .b32 matrix_a_0, matrix_a_1;
    // Registers to hold packed pairs of 16-bit data for matrix b (2 registers)
    .reg .b32 matrix_b_0, matrix_b_1;
    // General-purpose registers for loop control and constant values
    .reg .b32 loop_counter, loop_limit, packed_const;
    // Predicate register for conditional branching (loop exit)
    .reg .pred exit_predicate;

    // Set up loop counter and loop limit
    mov.u32 loop_counter, 0;
    mov.u32 loop_limit, 128;

    // Zero-initialize the accumulator registers
    mov.f32 accum_0, 0.0;
    mov.f32 accum_1, 0.0;
    mov.f32 accum_2, 0.0;
    mov.f32 accum_3, 0.0;

    // Initialize constant for packed matrix data (placeholder)
    mov.b32 packed_const, 0x00010001;

    // Initialize matrix a registers with the packed constant
    mov.b32 matrix_a_0, packed_const;
    mov.b32 matrix_a_1, packed_const;

    // Initialize matrix b registers with the packed constant
    mov.b32 matrix_b_0, packed_const;
    mov.b32 matrix_b_1, packed_const;

    // The main loop will repeat for 128 iterations
loop_start:
    setp.ge.u32 exit_predicate, loop_counter, loop_limit;
    @exit_predicate bra loop_end;

    mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16
        { accum_0, accum_1, accum_2, accum_3 },
        { matrix_a_0, matrix_a_1 },
        { matrix_b_0, matrix_b_1 },
        { accum_0, accum_1, accum_2, accum_3 };

    // Increment the loop counter
    add.u32 loop_counter, loop_counter, 1;

    // Branch back to the beginning of the loop
    bra loop_start;

loop_end:
    // If we simply exit, the computation will be optimized out!
    // Instead, let's check for an impossible condition, like if the thread ID
    // is equal to `UINT_MAX`, and if so - write accumulators to the global
    // memory NULL address.
    .reg .u32 tid;
    .reg .pred impossible_predicate;
    mov.u32 tid, %tid.x; //? Special system registers start with `%`
    setp.ne.u32 impossible_predicate, tid, 0xFFFFFFFF;
    @impossible_predicate bra loop_exit;

    // Write into memory:
    .reg .u64 store_ptr;
    mov.u64 store_ptr, 0;
    st.global.f32 [store_ptr], accum_0;
    st.global.f32 [store_ptr+4], accum_1;
    st.global.f32 [store_ptr+8], accum_2;
    st.global.f32 [store_ptr+12], accum_3;

loop_exit:
    ret;
}
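/**
 * Since PTX has no `for` statement, both kernels in this file spell the loop
 * out by hand: compare the counter with `setp`, consume the resulting
 * predicate with a guarded `bra`, and jump back unconditionally at the end of
 * the body. The bare idiom looks like this (register names are illustrative):
 *
 *      mov.u32 i, 0;
 *      mov.u32 n, 128;
 *  body:
 *      setp.ge.u32 done, i, n;  // done = (i >= n)
 *      @done bra after;         // leave the loop once the predicate fires
 *      // ... loop body ...
 *      add.u32 i, i, 1;
 *      bra body;
 *  after:
 */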
.visible .entry tops_f16f32_sm70mma_8x8x4_loop128_ptx_kernel()
{
    // Accumulator registers used for both input and output of the MMA
    // operation: the `f32` accumulator variant needs 8 registers, not 4
    .reg .b32 accum_0, accum_1, accum_2, accum_3,
              accum_4, accum_5, accum_6, accum_7;
    // Registers to hold packed pairs of 16-bit data for matrix a (2 registers)
    .reg .b32 matrix_a_0, matrix_a_1;
    // Registers to hold packed pairs of 16-bit data for matrix b (2 registers)
    .reg .b32 matrix_b_0, matrix_b_1;
    // General-purpose registers for loop control and constant values
    .reg .b32 loop_counter, loop_limit, packed_const;
    // Predicate register for conditional branching (loop exit)
    .reg .pred exit_predicate;

    // Set up loop counter and loop limit
    mov.u32 loop_counter, 0;
    mov.u32 loop_limit, 128;

    // Zero-initialize all 8 accumulator registers
    mov.f32 accum_0, 0.0;
    mov.f32 accum_1, 0.0;
    mov.f32 accum_2, 0.0;
    mov.f32 accum_3, 0.0;
    mov.f32 accum_4, 0.0;
    mov.f32 accum_5, 0.0;
    mov.f32 accum_6, 0.0;
    mov.f32 accum_7, 0.0;

    // Initialize constant for packed matrix data (placeholder)
    mov.b32 packed_const, 0x00010001;

    // Initialize matrix a registers with the packed constant
    mov.b32 matrix_a_0, packed_const;
    mov.b32 matrix_a_1, packed_const;

    // Initialize matrix b registers with the packed constant
    mov.b32 matrix_b_0, packed_const;
    mov.b32 matrix_b_1, packed_const;

    // The main loop will repeat for 128 iterations
loop_start:
    setp.ge.u32 exit_predicate, loop_counter, loop_limit;
    @exit_predicate bra loop_end;

    mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32
        { accum_0, accum_1, accum_2, accum_3,
          accum_4, accum_5, accum_6, accum_7 },
        { matrix_a_0, matrix_a_1 },
        { matrix_b_0, matrix_b_1 },
        { accum_0, accum_1, accum_2, accum_3,
          accum_4, accum_5, accum_6, accum_7 };

    // Increment the loop counter
    add.u32 loop_counter, loop_counter, 1;

    // Branch back to the beginning of the loop
    bra loop_start;

loop_end:
    // If we simply exit, the computation will be optimized out!
    // Instead, let's check for an impossible condition, like if the thread ID
    // is equal to `UINT_MAX`, and if so - write accumulators to the global
    // memory NULL address.
    .reg .u32 tid;
    .reg .pred impossible_predicate;
    mov.u32 tid, %tid.x; //? Special system registers start with `%`
    setp.ne.u32 impossible_predicate, tid, 0xFFFFFFFF;
    @impossible_predicate bra loop_exit;

    // Write into memory, covering all 8 accumulators this time:
    .reg .u64 store_ptr;
    mov.u64 store_ptr, 0;
    st.global.f32 [store_ptr], accum_0;
    st.global.f32 [store_ptr+4], accum_1;
    st.global.f32 [store_ptr+8], accum_2;
    st.global.f32 [store_ptr+12], accum_3;
    st.global.f32 [store_ptr+16], accum_4;
    st.global.f32 [store_ptr+20], accum_5;
    st.global.f32 [store_ptr+24], accum_6;
    st.global.f32 [store_ptr+28], accum_7;

loop_exit:
    ret;
}
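/**
 * A back-of-the-envelope sanity check for the numbers these kernels should
 * produce, assuming the documented `m8n8k4` shape: one MMA performs
 * 8 * 8 * 4 = 256 multiply-accumulates, i.e. 512 FLOPs, and the PTX ISA notes
 * that a warp executing `mma.m8n8k4` computes 4 such MMAs - roughly 2'048
 * FLOPs per warp-level instruction. Over the 128-iteration loop, that is
 * 128 * 2'048 = 262'144 FLOPs per warp, the quantity a host-side benchmark
 * would divide by the elapsed time.
 */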
/**
 * Here are some potentially counterintuitive facts about PTX and SASS:
 *
 * - NVCC unrolls loops more aggressively than any other mainstream compiler.
 *
 * - If you are coming from the CPU side, you shouldn't expect instructions to
 *   be forward-compatible or to have better or equal performance on the next
 *   generation! Entire instruction families may live for just one generation
 *   and be completely abandoned a couple of years later.
 *
 * - Some instructions, like Tensor Core Gen 4 and 5 operations, can't work
 *   with both multiplication operands in GPU registers. At least one of them
 *   has to be in shared memory. Moreover, they may run up to 10% faster with
 *   both arguments in shared memory!
 *
 * Because only one `.version` directive can be placed in each file, for newer
 * kernels, go to `less_slow_sm80.ptx` for Ampere and `less_slow_sm90a.ptx`
 * for Hopper.
 *
 * @see PTX module-level directives:
 *      https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-module-directives
 */
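/**
 * To relate these kernels back to the "Register File" section above, ask the
 * assembler for its resource-usage statistics:
 *
 * $ ptxas -v -arch=sm_70 -o /dev/null less_slow_sm70.ptx
 *
 * Each `.entry` should report a line like "ptxas info : Used N registers"
 * (exact figures depend on the CUDA toolkit version), letting you check the
 * per-thread register count against the 255-register budget.
 */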