# CUDA Exercises
These exercises will have you write some basic CUDA applications. You
will learn how to allocate GPU memory, move data between the host and
the GPU, and launch kernels.
Your first task is to create a simple hello world application in CUDA. The
code skeleton is already given to you in `hello.cu`. Edit that file, paying
attention to the FIXME locations, so that the output when run is like this:
```
Hello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1
```
(the ordering of the above lines may vary; ordering differences do not
indicate an incorrect result)
```
module load cuda
nvcc -o hello hello.cu
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
`nvcc` is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1 ./hello
```
Alternatively, you may want to create an alias for your `bsub` command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./hello
```
```
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --gres=gpu:1 -c 10 ./hello
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active:
```
salloc -C gpu -N 1 -t 60 -A m3502 --gres=gpu:1 -c 10
srun -n 1 ./hello
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
If you're up for a challenge, see if you can write a complete vector add
program from scratch. Or if you prefer, there is a skeleton code given to
you in `vector_add.cu`. Edit the code to build a complete vector_add
program. Compile it and run it similar to the method given in exercise 1.
You can refer to `vector_add_solution.cu` for a complete example.
Note that this skeleton code includes something we didn't cover in lesson 1:
CUDA error checking. Every CUDA runtime API call returns an error code.
It's good practice (especially if you're having trouble) to rigorously check
these error codes. A macro is given that will make this job easier. Note the
special error checking method after a kernel call.
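For reference, here is a sketch of this kind of error-checking macro and of the "special" check after a kernel launch (the macro actually provided in the skeleton may differ in detail):
```cpp
#include <cstdio>
#include <cstdlib>

// Sketch of an error-checking macro like the one provided in the skeletons.
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
              msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
      exit(1); \
    } \
  } while (0)

// Kernels do not return an error code, so check twice: right after the launch
// (catches configuration/launch errors) and again after synchronizing
// (catches errors that occur while the kernel executes asynchronously).
//
//   my_kernel<<<blocks, threads>>>(...);      // my_kernel is a placeholder name
//   cudaCheckErrors("kernel launch failure");
//   cudaDeviceSynchronize();
//   cudaCheckErrors("kernel execution failure");
```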
hello:
```cpp
#include <stdio.h>

// ... (the hello kernel definition is not included in this excerpt) ...

int main(){
  hello<<<2,2>>>();
  cudaDeviceSynchronize();
}
```
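A kernel body consistent with the expected output shown above might look like this sketch (the exact message format in hello.cu may differ):
```cpp
__global__ void hello(){
  // each thread prints its own block and thread coordinates
  printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}
```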
matrix_multiplication (naive), excerpt:
```cpp
#include <stdio.h>

int main(){
  // start timing
  t0 = clock();

  // ... (host/device allocation, initialization, and H2D copies elided in this excerpt) ...

  // Initialization timing
  t1 = clock();
  t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
  printf("Init took %f seconds. Begin compute\n", t1sum);

  // ... (kernel launch and result copy elided) ...

  // GPU timing
  t2 = clock();
  t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
  printf("Done. Compute took %f seconds\n", t2sum);

  // Verify results
  cudaCheckErrors("kernel execution failure or cudaMemcpy H2D failure");
  for (int i = 0; i < DSIZE*DSIZE; i++)
    if (h_C[i] != A_val*B_val*DSIZE) {
      printf("mismatch at index %d, was: %f, should be: %f\n", i, h_C[i], A_val*B_val*DSIZE);
      return -1;
    }
  printf("Success!\n");
  return 0;
}
```
vector_add, excerpt (truncated in this document):
```cpp
#include <stdio.h>
int main(){
  // ...
```
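Since the excerpt above is truncated in this document, here is a minimal, self-contained vector add sketch; the sizes, initialization, and variable names are illustrative assumptions and need not match vector_add.cu:
```cpp
#include <stdio.h>
#include <stdlib.h>

const int DSIZE = 4096;      // illustrative problem size
const int block_size = 256;  // illustrative threads per block

__global__ void vadd(const float *A, const float *B, float *C, int ds){
  int idx = threadIdx.x + blockDim.x * blockIdx.x;  // global thread index
  if (idx < ds)
    C[idx] = A[idx] + B[idx];                       // one element per thread
}

int main(){
  float *h_A = new float[DSIZE], *h_B = new float[DSIZE], *h_C = new float[DSIZE];
  float *d_A, *d_B, *d_C;
  for (int i = 0; i < DSIZE; i++){
    h_A[i] = rand()/(float)RAND_MAX;
    h_B[i] = rand()/(float)RAND_MAX;
    h_C[i] = 0;
  }
  cudaMalloc(&d_A, DSIZE*sizeof(float));
  cudaMalloc(&d_B, DSIZE*sizeof(float));
  cudaMalloc(&d_C, DSIZE*sizeof(float));
  cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
  cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
  printf("A[0] = %f\nB[0] = %f\nC[0] = %f\n", h_A[0], h_B[0], h_C[0]);
  return 0;
}
```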
# Homework 2
These exercises will help reinforce the concept of Shared Memory on the
GPU.
```
module load cuda
nvcc -o stencil_1d stencil_1d.cu
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1
./stencil_1d
```
Alternatively, you may want to create an alias for your *bsub* command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./stencil_1d
```
```
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --reservation cuda_training
--gres=gpu:1 -c 10 ./stencil_1d
```
```
salloc -C gpu -N 1 -t 60 -A m3502 --reservation cuda_training --gres=gpu:1
-c 10
srun -n 1 ./stencil_1d
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
```
module load cuda
nvcc -o matrix_mul matrix_mul_shared.cu
lsfrun ./matrix_mul
```
Note that timing information is included. Go back and run your solution from
Homework 1 and observe the runtime. What runtime impact do you notice
after applying shared memory to this 2D matrix multiply? How does it differ
from the runtime you observed in your previous implementation?
If you have trouble, you can look at *matrix_mul_shared_solution* for a
complete example.
matrix_mul_shared, excerpt:
```cpp
#include <stdio.h>

// ... (fragment from inside the shared-memory kernel) ...
// Synchronize
__syncthreads();
// ... (rest of the kernel elided) ...

int main(){
  // start timing
  t0 = clock();
  // Initialization timing
  t1 = clock();
  t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
  printf("Init took %f seconds. Begin compute\n", t1sum);
  // GPU timing
  t2 = clock();
  t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
  printf("Done. Compute took %f seconds\n", t2sum);
  // Verify results
  cudaCheckErrors("kernel execution failure or cudaMemcpy H2D failure");
  for (int i = 0; i < DSIZE*DSIZE; i++)
    if (h_C[i] != A_val*B_val*DSIZE) {
      printf("mismatch at index %d, was: %f, should be: %f\n", i, h_C[i], A_val*B_val*DSIZE);
      return -1;
    }
  printf("Success!\n");
  return 0;
}
```
1d_stencil, excerpt:
```cpp
#include <stdio.h>
#include <algorithm>

#define N 4096
#define RADIUS 3
#define BLOCK_SIZE 16

int main(void) {
  int *in, *out;      // host copies of a, b, c
  int *d_in, *d_out;  // device copies of a, b, c
  int size = (N + 2*RADIUS) * sizeof(int);

  // Alloc space for host copies and setup values
  in = (int *)malloc(size); fill_ints(in, N + 2*RADIUS);
  out = (int *)malloc(size); fill_ints(out, N + 2*RADIUS);

  // ... (device allocations elided in this excerpt) ...

  // Copy to device
  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice);

  // ... (stencil kernel launch and copy back to host elided) ...

  // Error Checking
  for (int i = 0; i < N + 2*RADIUS; i++) {
    if (i < RADIUS || i >= N+RADIUS) {
      if (out[i] != 1)
        printf("Mismatch at index %d, was: %d, should be: %d\n", i, out[i], 1);
    } else {
      if (out[i] != 1 + 2*RADIUS)
        printf("Mismatch at index %d, was: %d, should be: %d\n", i, out[i], 1 + 2*RADIUS);
    }
  }

  // Cleanup
  free(in); free(out);
  cudaFree(d_in); cudaFree(d_out);
  printf("Success!\n");
  return 0;
}
```
We'll use a slight variation on the vector add code presented in a previous
homework (*vector_add.cu*). Edit the code to build a complete vector_add
program. You can refer to *vector_add_solution.cu* for a complete
example. For this example, we have made a change to the kernel to use
something called a grid-stride loop. This topic will be dealt with in more
detail in a later training session, but for now we can describe it as a flexible
kernel design method that allows a simple kernel to handle an arbitrary size
data set with an arbitrary size "grid", i.e. the configuration of blocks and
threads associated with the kernel launch. If you'd like to read more about
grid-stride loops right now, you can visit
https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
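As a quick illustration, a grid-stride version of a vector add kernel might look like the following sketch (kernel and parameter names are illustrative, not necessarily those used in vector_add.cu):
```cpp
// Grid-stride loop: each thread starts at its global index and strides by the
// total number of threads in the grid, so any grid size can cover any n.
__global__ void vadd(const float *A, const float *B, float *C, size_t n){
  for (size_t idx = threadIdx.x + (size_t)blockDim.x * blockIdx.x;
       idx < n;
       idx += (size_t)blockDim.x * gridDim.x)
    C[idx] = A[idx] + B[idx];
}
```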
Note that this skeleton code includes something we didn't cover in lesson 1:
CUDA error checking. Every CUDA runtime API call returns an error code.
It's good practice (especially if you're having trouble) to rigorously check
these error codes. A macro is given that will make this job easier. Note the
special error checking method after a kernel call.
```
module load cuda
nvcc -o vector_add vector_add.cu
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1
./vector_add
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./vector_add
```
```
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --reservation cuda_training
--gres=gpu:1 -c 10 ./vector_add
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active (this is recommended if there are enough available nodes):
```
salloc -C gpu -N 1 -t 60 -A m3502 --reservation cuda_training --gres=gpu:1
-c 10
srun -n 1 ./vector_add
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
We've also changed the problem size from the previous example, so
correct output should look like this:
```
A[0] = 0.120663
B[0] = 0.615704
C[0] = 0.736367
```
The actual numerical values aren't too important, as long as C[0] = A[0] + B[0].
Our objective now will be to explore some of the concepts we learned in the
lesson. In particular we want to see what effect grid sizing (choice of
blocks, and threads per block) have on performance. We could do analysis
like this using host-code-based timing methods, but we'll introduce a new
concept, using a GPU profiler. In a future session, you'll learn more about
the GPU profilers (Nsight Compute and Nsight Systems), but for now we
will use Nsight Compute in a fairly simple fashion to get some basic data
about kernel behavior, to use for comparison.
(If you'd like to read more about the Nsight profilers, you can start here:
https://devblogs.nvidia.com/migrating-nvidia-nsight-tools-nvvp-nvprof/)
First, note that the code has these two lines in it:
```
int blocks = 1; // modify this line for experimentation
int threads = 1; // modify this line for experimentation
```
These lines control the grid sizing. The first variable blocks chooses the
total number of blocks to launch. The second variable threads chooses the
number of threads per block to launch. This second variable must be
constrained to choices between 1 and 1024, inclusive. These are limits
imposed by the GPU hardware.
Let's consider 3 cases. In each case, we will modify the blocks and threads
variables, recompile the code, and then run the code under the Nsight
Compute profiler.
For the following profiler experiments, we will assume you have loaded the profiler module and acquired a node for interactive usage:
```
module load nsight-compute
bsub -W 30 -nnodes 1 -P <allocation_ID> -Is /bin/bash
```
For this experiment, leave the code as you have created it to complete
exercise 1 above. When running the code you may have noticed it takes a
few seconds to run, however the duration is not particularly long. This
raises the question "how much of that time is the kernel running?" The
profiler can help us answer that question, and we can use this duration (or
various other characteristics) as indicators of "performance" for
comparison. The kernel is designed to do the same set of arithmetic
calculations regardless of the grid sizing choices, so we can say that
shorter kernel duration corresponds to higher performance.
If you'd like to get a basic idea of "typical" profiler output, you could use the
following command:
```
jsrun -n1 -a1 -c1 -g1 nv-nsight-cu-cli ./vector_add
```
However for this 1 block/1 thread test case, the profiler will spend several
minutes assembling the requested set of information. Since our focus is on
kernel duration, we can use a command that allows the profiler to run more
quickly:
```
jsrun -n1 -a1 -c1 -g1 nv-nsight-cu-cli --section SpeedOfLight --section
MemoryWorkloadAnalysis ./vector_add
```
This will allow the profiler to complete its work in under a minute.
We won't parse all the output, but we're interested in these lines:
```
Duration                second          2.86
```
and:
```
Memory Throughput       Mbyte/second    204.25
```
The above indicate that our kernel took about 3 seconds to run and
achieved around 200MB/s "throughput" i.e. combined read and write
activity, to the GPU memory. A Tesla V100 has around 700-900 GB/s of
available memory throughput, so this code isn't using the available memory
bandwidth very well, amongst other issues. Can we improve the situation
with some changes to our grid sizing?
So let's take a baby step with our code. Let's change from 1 block of 1
thread to 1 block of 1024 threads. As we've learned, this structure isn't very
good, because it can use at most a single SM on our GPU, but can it
improve performance at all?
Edit the code to make the changes to the threads (1024) variable only.
Leave the blocks variable at 1. Recompile the code and then rerun the
same profiler command. What are the kernel duration and (achieved)
memory throughput now?
(You should now observe a kernel duration that drops from the second
range to the millisecond range, and the memory throughput should now be
in the GB/s instead of MB/s)
Let's fill the GPU now. We learned that a Tesla V100 has 80 SMs, and
each SM can handle at most 2048 threads. If we create a grid of 160
blocks, each of 1024 threads, this should allow for maximum "occupancy"
of our kernel/grid on the GPU. Make the necessary changes to the blocks
(= 160) variable (the threads variable should already be at 1024 from step
2b), recompile the code, and rerun the profiler command as given in 2a.
What is the performance (kernel duration) and achieved memory
throughput now?
(You should now observe a kernel duration that has dropped to the
microsecond range - ~500us - and a memory throughput that should be
"close" to the peak theoretical of 900GB/s for a Tesla V100).
For the Tesla V100 GPU, this calculation of 80 SMs * 2048 threads/SM =
164K threads is our definition of "lots of threads".
vector_add, excerpt (truncated in this document):
```cpp
#include <stdio.h>
int main(){
  // ...
```
Your first task is to create a simple matrix row and column sum application
in CUDA. The code skeleton is already given to you in *matrix_sums.cu*.
Edit that file, paying attention to the FIXME locations, so that the output
when run is like this:
```
row sums correct!
column sums correct!
```
After editing the code, compile it using the following:
```
module load cuda
nvcc -o matrix_sums matrix_sums.cu
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1
./matrix_sums
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./matrix_sums
```
```
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --reservation cuda_training
--gres=gpu:1 -c 10 ./matrix_sums
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active (this is recommended if there are enough available nodes):
```
salloc -C gpu -N 1 -t 60 -A m3502 --reservation cuda_training --gres=gpu:1
-c 10
srun -n 1 ./matrix_sums
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
## **2. Profiling**
We'll introduce something new: the profiler (in this case, Nsight Compute).
We'll use the profiler first to time the kernel execution times, and then to
gather some "metric" information that will possibly shed light on our
observations.
It's necessary to complete task 1 first. Next, load the Nsight Compute
module:
```
module load nsight-compute
```
```
lsfrun nv-nsight-cu-cli ./matrix_sums
```
```
lsfrun nv-nsight-cu-cli --metrics
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe
_lsu_mem_global_op_ld.sum ./matrix_sums
```
Our goal is to measure the global memory load efficiency of our kernels. In
this case we have asked for two metrics:
"*l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum*" (the number of
global memory load requests) and
"*l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum*" (the number of
sectors requested for global loads). This first metric above represents the
denominator (requests) of the desired measurement (transactions per
request) and the second metric represents the numerator (transactions).
Dividing these numbers will give us the number of transactions per request.
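For example (purely illustrative numbers): if the profiler reported 1,048,576 sectors and 32,768 requests for global loads, the kernel would average 1,048,576 / 32,768 = 32 sectors (transactions) per request, which for 4-byte elements is the worst case; a fully coalesced access pattern would be closer to 4 sectors per request.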
What similarities or differences do you notice between the *row_sum* and
*column_sum* kernels?
Do the kernels (*row_sum*, *column_sum*) have the same or different
efficiencies?
Why?
How does this correspond to the observed kernel execution times for the
first profiling run?
Can we improve this? (Stay tuned for the next CUDA training session.)
Here is a useful blog to help you get familiar with Nsight Compute:
https://devblogs.nvidia.com/using-nsight-compute-to-inspect-your-kernels/
matrix_sums, excerpt:
```cpp
#include <stdio.h>

int main(){
  float *h_A, *h_sums, *d_A, *d_sums;
  h_A = new float[DSIZE*DSIZE];  // allocate space for data in host memory
  h_sums = new float[DSIZE]();
  for (int i = 0; i < DSIZE*DSIZE; i++)  // initialize matrix in host memory
    h_A[i] = 1.0f;

  // ... (device allocation and H2D copy elided in this excerpt) ...

  row_sums<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_sums, DSIZE);
  cudaCheckErrors("kernel launch failure");
  //cuda processing sequence step 2 is complete

  // ... (copy back and check of the row sums elided) ...

  cudaMemset(d_sums, 0, DSIZE*sizeof(float));
  column_sums<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_sums, DSIZE);
  cudaCheckErrors("kernel launch failure");
  //cuda processing sequence step 2 is complete
  // copy vector sums from device to host:
  cudaMemcpy(h_sums, d_sums, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
  //cuda processing sequence step 3 is complete

  // ... (remainder of the listing not included in this document) ...
```
For your first task, the code is already written for you. We will compare 3 of
the reductions given during the presentation: the naive atomic-only
reduction, the classical parallel reduction with atomic finish, and the warp
shuffle reduction (with atomic finish).
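For reference, here is a sketch of the warp-shuffle-plus-atomic pattern being compared (the kernels in reductions.cu are likely structured differently, e.g. with an additional shared-memory stage):
```cpp
// Each warp reduces its own values with shuffles; lane 0 of each warp then
// adds its partial sum to the global result with a single atomic.
__global__ void reduce_ws_sketch(const float *in, float *out, size_t n){
  float val = 0.0f;
  // grid-stride accumulation into a per-thread partial sum
  for (size_t i = threadIdx.x + (size_t)blockDim.x * blockIdx.x; i < n;
       i += (size_t)blockDim.x * gridDim.x)
    val += in[i];
  // warp-level reduction using shuffles
  for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    val += __shfl_down_sync(0xFFFFFFFF, val, offset);
  if ((threadIdx.x % warpSize) == 0)   // one atomic per warp
    atomicAdd(out, val);
}
```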
```
module load cuda
nvcc -o reductions reductions.cu
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++. Let's also load the Nsight Compute module:
```
module load nsight-compute
```
To run your code, we will use an LSF command:
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1
nv-nsight-cu-cli ./reductions
```
Alternatively, you may want to create an alias for your *bsub* command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun nv-nsight-cu-cli ./reductions
```
```
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --reservation cuda_training
--gres=gpu:1 -c 10 ./reductions
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active (this is recommended if there are enough available nodes):
```
salloc -C gpu -N 1 -t 60 -A m3502 --reservation cuda_training --gres=gpu:1
-c 10
srun -n 1 ./reductions
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
This will run the code with the profiling in its most basic mode, which is
sufficient. We want to compare kernel execution times. What do you notice
about kernel execution times? Probably, you won't see much difference
between the parallel reduction with atomics and the warp shuffle with
atomics kernel. Can you theorize why this may be? Our objective with
these will be to approach theoretical limits. The theoretical limit for a typical
reduction would be determined by the memory bandwidth of the GPU. To
calculate the attained memory bandwidth of this kernel, divide the total data
size in bytes (use N from the code in your calculation) by the execution
time (which you can get from the profiler). How does this number compare
to the memory bandwidth of the GPU you are running on? (You could run the bandwidthTest sample code to get a proxy/estimate.)
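For example (hypothetical numbers): with N at roughly 8M float elements, the reduction reads about 8,388,608 * 4 bytes ≈ 33.5 MB; if the profiler reported a kernel duration of 50 microseconds, the attained bandwidth would be about 33.5 MB / 50 µs ≈ 670 GB/s, which you could then compare against the device's peak.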
Now edit the code to change *N* from ~8M to 163840 (=640*256).
Recompile and re-run the code with profiling. Is there a bigger percentage
difference between the execution time of the reduce_a and reduce_ws
kernel? Why might this be?
Bonus: edit the code to change *N* from ~8M to ~32M. Recompile and run. What happened? Why?
```
nvcc -o max_reduction max_reduction.cu
lsfrun ./max_reduction
```
For this exercise, start with the *matrix_sums.cu* code from hw4. As you
may recall, the *row_sums* kernel was reading the same data set as the
*column_sums* kernel, but running noticeably slower. We now have some
ideas how to fix it. See if you can implement a reduction-per-row, to allow
the row-sum kernel to approach the performance of the column sum kernel.
There are probably several ways to tackle this problem. To see one
approach, refer to the solution.
You can start just by compiling the code as-is and running the profiler to
remind yourself of the performance (discrepancy).
```
nvcc -o matrix_sums matrix_sums.cu
lsfrun nv-nsight-cu-cli ./matrix_sums
```
Since the (given) solution may be somewhat unusual, I'll give some hints
here if needed:
After you have completed the work and are getting a successful result,
profile the code again to see if the performance of the row_sums kernel has
improved:
```
nvcc -o matrix_sums matrix_sums.cu
lsfrun nv-nsight-cu-cli ./matrix_sums
```
row_sums kernel fragment (hint):
```cpp
#include <stdio.h>

// ... (inside the revised row_sums kernel, one block per row) ...
int idx = blockIdx.x;  // our block index becomes our row indicator
if (idx < ds){
  __shared__ float sdata[block_size];
  int tid = threadIdx.x;
  sdata[tid] = 0.0f;
  size_t tidx = tid;
  while (tidx < ds) {  // block stride loop to load data
    sdata[tid] += A[idx*ds+tidx];
    tidx += blockDim.x;
  }
  // ... (shared-memory reduction of sdata and write of the row sum elided) ...
```
max_reduction, excerpt (truncated in this document):
```cpp
#include <stdio.h>
int main(){
  // ...
```
# Homework 6
These exercises will have you use Unified Memory to utilize GPUs on
non-trivial data structures.
For your first task, you are given a code that assembles a linked list on the
CPU, and then attempts to print an element from the list. Your task is to
modify the code using UM techniques, so that the linked list can be
correctly traversed either from CPU code or from GPU code. Hint: there is
only one line in the file that needs to be modified to do this exercise.
```
module load cuda
nvcc -o linked_list linked_list.cu
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./linked_list
```
```
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --gres=gpu:1 -c 10 ./linked_list
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active (this is recommended if there are enough available nodes):
```
salloc -C gpu -N 1 -t 60 -A m3502 --gres=gpu:1 -c 10
srun -n 1 ./linked_list
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
Correct output should look like:
```
key = 3
key = 3
```
In this exercise, you are given a code that increments a large array on the GPU.
a. First, compile the code and profile it with Nsight Systems:
```
module load nsight-systems
nvcc -o array_inc array_inc.cu
lsfrun nsys profile --stats=true ./array_inc
```
b. Now, modify the code to use managed memory. Replace the malloc
operations with cudaMallocManaged, and eliminate the cudaMemcpy
operations. Do you need to replace the *cudaMemcpy* operation from
device to host with a *cudaDeviceSynchronize()*? Why? Now, compile and
profile the code again. Compare the kernel execution duration to the
previous result. Note the profiler indication of CPU and GPU page faults.
c. Now, modify the code to insert prefetching of the array to the GPU
immediately before the kernel call, and back to the CPU immediately after
the kernel call. Compile and profile the code again. Compare the kernel
execution time to the previous results. Are there still any page faults? Why?
d. Bonus: Modify the code to run the *inc()* kernel 10000 times in a row
instead of just once. What can be said about the impact of memory
operations on our runtime? What would this suggest for a real-world
application?
array_inc, excerpt:
```cpp
#include <cstdio>
#include <cstdlib>

// error checking macro
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
              msg, cudaGetErrorString(__err), \
              __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n"); \
      exit(1); \
    } \
  } while (0)

// ... (tail of the alloc_bytes helper; its signature is elided in this excerpt) ...
  cudaMallocManaged(&ptr, num_bytes);
}

__global__ void inc(int *array, size_t n){
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  while (idx < n){
    array[idx]++;
    idx += blockDim.x*gridDim.x; // grid-stride loop
  }
}

int main(){
  int *h_array;
  // ... (ds, the array size, is defined in the full source; elided here) ...
  alloc_bytes(h_array, ds*sizeof(h_array[0]));
  cudaCheckErrors("cudaMallocManaged Error");
  memset(h_array, 0, ds*sizeof(h_array[0]));
  cudaMemPrefetchAsync(h_array, ds*sizeof(h_array[0]), 0); // add in step 2c
  inc<<<256, 256>>>(h_array, ds);
  cudaCheckErrors("kernel launch error");
  cudaMemPrefetchAsync(h_array, ds*sizeof(h_array[0]), cudaCpuDeviceId); // add in step 2c
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel execution error");
  for (int i = 0; i < ds; i++)
    if (h_array[i] != 1) {
      printf("mismatch at %d, was: %d, expected: %d\n", i, h_array[i], 1);
      return -1;
    }
  printf("success!\n");
  return 0;
}
```
linked_list, excerpt:
```cpp
#include <cstdio>
#include <cstdlib>

// error checking macro
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
              msg, cudaGetErrorString(__err), \
              __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n"); \
      exit(1); \
    } \
  } while (0)

struct list_elem {
  int key;
  list_elem *next;
};

// ... (tail of the allocation helper; its signature is elided in this excerpt) ...
  cudaMallocManaged(&ptr, num_bytes);
}

__host__ __device__
void print_element(list_elem *list, int ele_num){
  list_elem *elem = list;
  for (int i = 0; i < ele_num; i++)
    elem = elem->next;
  printf("key = %d\n", elem->key);
}
```
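The two matching "key = 3" lines in the expected output presumably come from traversing the list once from host code and once from a kernel; a minimal device-side call might look like this sketch (the wrapper kernel and the list_base variable are illustrative names):
```cpp
// Trivial single-thread kernel that walks the (managed) list on the GPU.
__global__ void print_element_kernel(list_elem *list, int ele_num){
  print_element(list, ele_num);  // the __host__ __device__ function shown above
}

// Host-side usage (illustrative):
//   print_element(list_base, 3);                    // traverse on the CPU
//   print_element_kernel<<<1,1>>>(list_base, 3);    // traverse on the GPU
//   cudaDeviceSynchronize();
```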
For your first task, you are given a code that performs a silly computation
element-wise on a vector. You can initially compile, run and profile the code
if you wish.
```
module load cuda
nvcc -o overlap overlap.cu
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
To run your code, we will use an LSF command:
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1 ./overlap
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./overlap
```
```
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 -G 1 -c 10 ./overlap
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active (this is recommended if there are enough available nodes):
```
salloc -C gpu -N 1 -t 60 -A m3502 -G 1 -c 10
srun -n 1 ./overlap
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
In this case, the output will show the elapsed time of the non-overlapped
version of the code. This code copies the entire vector to the device, then
launches the processing kernel, then copies the entire vector back to the
host.
You can also run this code with Nsight Systems if you wish:
```
module load nsight-systems
lsfrun nsys profile -o <destination_dir>/overlap.qdrep ./overlap
```
Note that you will have to copy this file over to your local machine and
install Nsight Systems for visualization. You can download Nsight Systems
here:
https://developer.nvidia.com/nsight-systems
Your objective is to create a fully overlapped version of the code. Use your
knowledge of streams to create a version of the code that will issue the
work in chunks: for each chunk, perform the copy to device, kernel launch, and copy to host in a single stream, then switch to a different stream for the next chunk. The work has been started for you in the section of code
after the #ifdef statement. Look for the FIXME tokens there, and replace
each FIXME with appropriate code to complete this task.
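A sketch of the per-chunk pattern follows (it reuses the variable and kernel names from the listing later in this document, but chunk_size and the exact indexing are illustrative assumptions):
```cpp
// Chunked copy/compute/copy overlap: each chunk's H2D copy, kernel, and D2H
// copy go into one stream; successive chunks rotate through the stream pool.
size_t chunk_size = ds / chunks;                       // assumes ds % chunks == 0
for (int i = 0; i < chunks; i++) {
  cudaStream_t s = streams[i % num_streams];           // round-robin over streams
  size_t offset = i * chunk_size;
  cudaMemcpyAsync(d_x + offset, h_x + offset, chunk_size * sizeof(ft),
                  cudaMemcpyHostToDevice, s);
  gaussian_pdf<<<(chunk_size + 255) / 256, 256, 0, s>>>(
      d_x + offset, d_y + offset, 0.0, 1.0, (int)chunk_size);
  cudaMemcpyAsync(h_y + offset, d_y + offset, chunk_size * sizeof(ft),
                  cudaMemcpyDeviceToHost, s);
}
cudaDeviceSynchronize();                               // wait for all streams to finish
```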
When you have something ready to test, compile with this additional switch:
```
nvcc -o overlap overlap.cu -DUSE_STREAMS
```
If you run the code, there will be a verification check performed, to make
sure you have processed the entire vector correctly, in chunks. If you pass
the verification test, the program will display the elapsed time of the
streamed version. You should be able to get at least a 2X speedup (i.e. half the duration) compared to the non-streamed version. If you wish, you can also run this code with the Nsight Systems profiler using the command given above.
This will generate a visual output, and you should be able to confirm that
there is indeed overlap of operations by zooming in on the portion of
execution related to kernel launches. You can see the non-overlapped
version run, followed by the overlapped version. Not only should the
overlapped version be faster, you should see an interleaving of
computation and data transfer operations.
In this exercise, you are given a very simple code that performs 4 kernel
calls in sequence on a single GPU. You're welcome to compile and run the
code as-is. It will display an overall duration for the time taken to complete
the 4 kernel calls. Your task is to modify this code to run each kernel on a
separate GPU (each node on Summit actually has 6 GPUs). After
completion, confirm that the execution time is substantially reduced.
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g4'
lsfrun ./multi
```
On Cori, make sure that you ask for an allocation with 4 GPUs, e.g.
```
srun -C gpu -N 1 -n 1 -t 10 -A m3502 -G 4 -c 40 ./multi
```
**HINT**: This exercise might be simpler than you think. You won't need to
do anything with streams at all for this. You'll only need to make a simple
modification to each of the for-loops.
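A sketch of the kind of change the hint is pointing at (illustrative only; the actual loop bodies and array names in the code may differ):
```cpp
// Select a different device before issuing each GPU's work; kernel launches
// are asynchronous, so the loop issues work to all GPUs before waiting.
for (int i = 0; i < num_gpus; i++) {
  cudaSetDevice(i);                       // route subsequent calls to GPU i
  cudaMalloc(&d_x[i], ds * sizeof(ft));   // per-GPU allocations
  cudaMalloc(&d_y[i], ds * sizeof(ft));
  cudaMemcpy(d_x[i], h_x, ds * sizeof(ft), cudaMemcpyHostToDevice);
  gaussian_pdf<<<((int)ds + 255) / 256, 256>>>(d_x[i], d_y[i], 0.0, 1.0, (int)ds);
}
for (int i = 0; i < num_gpus; i++) {
  cudaSetDevice(i);
  cudaDeviceSynchronize();                // wait for each GPU to finish
}
```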
```cpp
// modifiable
typedef float ft;
const int chunks = 64;
const size_t ds = 1024*1024*chunks;
const int count = 22;
const int num_gpus = 4;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
// compute average gaussian pdf value over a window around each point
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y,
const ft mean, const ft sigma, const int n) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n) {
ft in = x[idx] - (count / 2) * 0.01f;
ft out = 0;
for (int i = 0; i < count; i++) {
ft temp = (in - mean) / sigma;
out += gpdf(temp, sigma);
in += 0.01f;
}
y[idx] = out / count;
}
}
// host-based timing
#define USECPSEC 1000000ULL
int main() {
ft *h_x, *d_x[num_gpus], *d_y[num_gpus];
h_x = (ft *)malloc(ds * sizeof(ft));
et1 = dtime_usec(et1);
std::cout << "elapsed time: " << et1/(float)USECPSEC << std::endl;
return 0;
}
```
```cpp
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
// modifiable
typedef float ft;
const int chunks = 64;
const size_t ds = 1024*1024*chunks;
const int count = 22;
const int num_gpus = 4;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
// host-based timing
#define USECPSEC 1000000ULL
int main() {
ft *h_x, *d_x[num_gpus], *d_y[num_gpus];
h_x = (ft *)malloc(ds * sizeof(ft));
et1 = dtime_usec(et1);
std::cout << "elapsed time: " << et1/(float)USECPSEC << std::endl;
return 0;
}
```
```cpp
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
// modifiable
typedef float ft;
const int chunks = 64;
const size_t ds = 1024*1024*chunks;
const int count = 22;
const int num_streams = 8;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
// compute average gaussian pdf value over a window around each point
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y,
const ft mean, const ft sigma, const int n) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n) {
ft in = x[idx] - (count / 2) * 0.01f;
ft out = 0;
for (int i = 0; i < count; i++) {
ft temp = (in - mean) / sigma;
out += gpdf(temp, sigma);
in += 0.01f;
}
y[idx] = out / count;
}
}
// host-based timing
#define USECPSEC 1000000ULL
int main() {
ft *h_x, *d_x, *h_y, *h_y1, *d_y;
cudaHostAlloc(&h_x, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y1, ds*sizeof(ft), cudaHostAllocDefault);
cudaMalloc(&d_x, ds*sizeof(ft));
cudaMalloc(&d_y, ds*sizeof(ft));
cudaCheckErrors("allocation error");
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++) {
cudaStreamCreate(&streams[i]);
}
cudaCheckErrors("stream creation error");
et1 = dtime_usec(et1);
std::cout << "non-stream elapsed time: " << et1/(float)USECPSEC <<
std::endl;
#ifdef USE_STREAMS
cudaMemset(d_y, 0, ds * sizeof(ft));
et = dtime_usec(et);
return 0;
}
```
```cpp
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
// modifiable
typedef float ft;
const int chunks = 64;
const size_t ds = 1024*1024*chunks;
const int count = 22;
const int num_streams = 8;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
// compute average gaussian pdf value over a window around each point
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y,
const ft mean, const ft sigma, const int n) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n) {
ft in = x[idx] - (count / 2) * 0.01f;
ft out = 0;
for (int i = 0; i < count; i++) {
ft temp = (in - mean) / sigma;
out += gpdf(temp, sigma);
in += 0.01f;
}
y[idx] = out / count;
}
}
// host-based timing
#define USECPSEC 1000000ULL
int main() {
ft *h_x, *d_x, *h_y, *h_y1, *d_y;
cudaHostAlloc(&h_x, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y1, ds*sizeof(ft), cudaHostAllocDefault);
cudaMalloc(&d_x, ds*sizeof(ft));
cudaMalloc(&d_y, ds*sizeof(ft));
cudaCheckErrors("allocation error");
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++) {
cudaStreamCreate(&streams[i]);
}
cudaCheckErrors("stream creation error");
gaussian_pdf<<<(ds + 255) / 256, 256>>>(d_x, d_y, 0.0, 1.0, ds); //
warm-up
et1 = dtime_usec(et1);
std::cout << "non-stream elapsed time: " << et1/(float)USECPSEC <<
std::endl;
#ifdef USE_STREAMS
cudaMemset(d_y, 0, ds * sizeof(ft));
return 0;
}
```
This exercise, in 3 parts, is designed to walk you through an analysis-driven optimization sequence using Nsight Compute. The overall exercise is focused on optimizing a square matrix transpose. This operation can be simply described as:

B(i,j) = A(j,i)

for input matrix A, output matrix B, and indices i and j varying over the square matrix side dimension. This algorithm involves essentially no arithmetic, so it is a memory bound algorithm, and our final objective will be to come as close as possible to the available memory bandwidth of the GPU we are running on.
For your first task, change into the *task1* directory. There you should edit
the *task1.cu* file to complete the matrix transpose operation. Most of the
code is written for you, but replace the **FIXME** entries with the proper
code to complete the matrix transpose using global memory. The formula
given above should guide your efforts. Here are some hints:
- Each thread reads from (row, col) and writes to (col, row)
- Using indexing macro:
```cpp
#define INDX( row, col, ld ) ( ( (row) * (ld) ) + (col) )  // ld = leading dimension (width)
```
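Putting those hints together, the body of the global-memory transpose kernel might look like the sketch below (the kernel signature is an assumption; task1_solution.cu shows the real thing):
```cpp
// Naive global-memory transpose: each thread reads A(row, col) and writes it
// to C(col, row), using the INDX macro above for 2D indexing.
__global__ void naive_cuda_transpose(const int m, const double *a, double *c){
  const int myCol = blockDim.x * blockIdx.x + threadIdx.x;
  const int myRow = blockDim.y * blockIdx.y + threadIdx.y;
  if (myRow < m && myCol < m)
    c[INDX(myCol, myRow, m)] = a[INDX(myRow, myCol, m)];
}
```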
If you need help, you can refer to the *task1_solution.cu* file. Then
compile and test your code:
```bash
module load cuda
./build_nvcc
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
```bash
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1 ./task1
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```bash
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./task1
```
```bash
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --gres=gpu:1 -c 10 ./task1
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active (this is recommended if there are enough available nodes):
```bash
salloc -C gpu -N 1 -t 60 -A m3502 --gres=gpu:1 -c 10
srun -n 1 ./task1
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
Once you have a PASS result, begin the first round of analysis by running
the profiler:
```bash
module load nsight-compute
lsfrun nv-nsight-cu-cli --metrics
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe
_lsu_mem_global_op_ld.sum,l1tex__average_t_sectors_per_request_pipe
_lsu_mem_global_op_ld.ratio,l1tex__t_sectors_pipe_lsu_mem_global_op_
st.sum,l1tex__t_requests_pipe_lsu_mem_global_op_st.sum,l1tex__averag
e_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio,smsp__sass_
average_data_bytes_per_sector_mem_global_op_ld.pct,smsp__sass_aver
age_data_bytes_per_sector_mem_global_op_st.pct ./task1
```
Considering the output of the profiler, are the Global Load Efficiency and
Global Store Efficiency both at 100%? Why or why not? This may be a
good time to study the load and store indexing carefully, and review the
global coalescing rules learned in Homework 4.
Change from directory *task1* to *task2*. Edit the *task2.cu* file, wherever
the **FIXME** occurs, to achieve the above two operations. If you need
help, refer to the *task2_solution.cu* file. This is the hardest programming
assignment of the 3 tasks in this exercise.
```bash
./build_nvcc
lsfrun ./task2
```
You should get a PASS output. Has the measured bandwidth improved?
Once again we will use the profiler to help explain our observations. We
have introduced shared memory operations into our algorithm, so we will
include shared memory metrics in our profiling:
```bash
lsfrun nv-nsight-cu-cli --metrics
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe
_lsu_mem_global_op_ld.sum,l1tex__average_t_sectors_per_request_pipe
_lsu_mem_global_op_ld.ratio,l1tex__t_sectors_pipe_lsu_mem_global_op_
st.sum,l1tex__t_requests_pipe_lsu_mem_global_op_st.sum,l1tex__averag
e_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio,smsp__sass_
average_data_bytes_per_sector_mem_global_op_ld.pct,smsp__sass_aver
age_data_bytes_per_sector_mem_global_op_st.pct,l1tex__data_pipe_lsu_
wavefronts_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefronts_m
em_shared_op_st.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared
_op_ld.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum
,smsp__sass_average_data_bytes_per_wavefront_mem_shared.pct
./task2
```
- *l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum*: The
number of shared load transactions
- *l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum*: The
number of shared store transactions
- *l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum*: The
number of shared load bank conflicts
- *l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum*: The
number of shared store bank conflicts
- *smsp__sass_average_data_bytes_per_wavefront_mem_shared.pct*:
Shared Memory efficiency
You should be able to confirm that the previous global load/global store
efficiency issues have been resolved, with proper coalescing. However
now we have a problem with shared memory: bank conflicts. Review
module 4 information on bank conflicts, for a basic definition of how these
arise during shared memory access.
Our strategy to fix shared memory bank conflicts in this case is fairly
simple. We will leave the shared memory indexing unchanged from
exercise 2, but we will add a column to the shared memory definition in our
code. This will allow both row-wise and columnar access to shared memory
(needed for our in-tile transpose step) without bank conflicts.
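A sketch of the padding change (the array name and tile sizes follow the listings later in this document; the essential edit is the extra column):
```cpp
// One extra column of padding shifts each row of the tile by one bank, so both
// row-wise and column-wise accesses to the tile avoid shared memory bank conflicts.
__shared__ double smemArray[THREADS_PER_BLOCK_X][THREADS_PER_BLOCK_Y + 1];
```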
You should get a passing result. Has the achieved bandwidth improved?
You can profile your code to confirm that we are now using shared memory
in an efficient fashion, for both loads and stores.
```bash
lsfrun nv-nsight-cu-cli --metrics
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe
_lsu_mem_global_op_ld.sum,l1tex__average_t_sectors_per_request_pipe
_lsu_mem_global_op_ld.ratio,l1tex__t_sectors_pipe_lsu_mem_global_op_
st.sum,l1tex__t_requests_pipe_lsu_mem_global_op_st.sum,l1tex__averag
e_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio,smsp__sass_
average_data_bytes_per_sector_mem_global_op_ld.pct,smsp__sass_aver
age_data_bytes_per_sector_mem_global_op_st.pct,l1tex__data_pipe_lsu_
wavefronts_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefronts_m
em_shared_op_st.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared
_op_ld.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum
,smsp__sass_average_data_bytes_per_wavefront_mem_shared.pct
./task3
```
```bash
nv-nsight-cu-cli --metrics
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe
_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_
st.sum,l1tex__t_requests_pipe_lsu_mem_global_op_st.sum ./task1
```
```cpp
/*
* Copyright 2014 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#ifdef DEBUG
#define CUDA_CALL(F) if( (F) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__); exit(-1);}
#define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__-1); exit(-1);}
#else
#define CUDA_CALL(F) (F)
#define CUDA_CHECK()
#endif
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32
} /* end naive_cuda_transpose */
} /* end host_dgemm */
/* start timers */
CUDA_CALL( cudaEventRecord( start, 0 ) );
end:
free( h_a );
free( h_c );
CUDA_CALL( cudaFree( d_a ) );
CUDA_CALL( cudaFree( d_c ) );
CUDA_CALL( cudaDeviceReset() );
return 0;
} /* end main */
```
```cpp
/*
* Copyright 2014 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#ifdef DEBUG
#define CUDA_CALL(F) if( (F) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__); exit(-1);}
#define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__-1); exit(-1);}
#else
#define CUDA_CALL(F) (F)
#define CUDA_CHECK()
#endif
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32
} /* end naive_cuda_transpose */
} /* end host_dgemm */
/* start timers */
CUDA_CALL( cudaEventRecord( start, 0 ) );
end:
free( h_a );
free( h_c );
CUDA_CALL( cudaFree( d_a ) );
CUDA_CALL( cudaFree( d_c ) );
CUDA_CALL( cudaDeviceReset() );
return 0;
} /* end main */
```
```bash
nv-nsight-cu-cli --metrics
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe
_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_
st.sum,l1tex__t_requests_pipe_lsu_mem_global_op_st.sum,l1tex__data_pi
pe_lsu_wavefronts_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefr
onts_mem_shared_op_st.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_
shared_op_ld.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op
_st.sum,smsp__cycles_active.avg.pct_of_peak_sustained_elapsed ./task2
```
```cpp
/*
* Copyright 2014 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <math.h>
#ifdef DEBUG
#define CUDA_CALL(F) if( (F) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__); exit(-1);}
#define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__-1); exit(-1);}
#else
#define CUDA_CALL(F) (F)
#define CUDA_CHECK()
#endif
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32
smemArray[FIXME][FIXME] =
a[FIXME];
} /* end if */
/* synchronize */
__syncthreads();
if( myRow < m && myCol < m )
{
/* write the result */
c[FIXME] =
smemArray[FIXME][FIXME];
} /* end if */
return;
} /* end smem_cuda_transpose */
} /* end host_dgemm */
/* start timers */
CUDA_CALL( cudaEventRecord( start, 0 ) );
end:
free( h_a );
free( h_c );
CUDA_CALL( cudaFree( d_a ) );
CUDA_CALL( cudaFree( d_c ) );
CUDA_CALL( cudaDeviceReset() );
return 0;
}
```
```cpp
/*
* Copyright 2014 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <math.h>
#ifdef DEBUG
#define CUDA_CALL(F) if( (F) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__); exit(-1);}
#define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__-1); exit(-1);}
#else
#define CUDA_CALL(F) (F)
#define CUDA_CHECK()
#endif
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32
smemArray[threadIdx.x][threadIdx.y] =
a[INDX( tileX + threadIdx.x, tileY + threadIdx.y, m )];
} /* end if */
/* synchronize */
__syncthreads();
if( myRow < m && myCol < m )
{
/* write the result */
c[INDX( tileY + threadIdx.x, tileX + threadIdx.y, m )] =
smemArray[threadIdx.y][threadIdx.x];
} /* end if */
return;
} /* end smem_cuda_transpose */
} /* end host_dgemm */
/* start timers */
CUDA_CALL( cudaEventRecord( start, 0 ) );
end:
free( h_a );
free( h_c );
CUDA_CALL( cudaFree( d_a ) );
CUDA_CALL( cudaFree( d_c ) );
CUDA_CALL( cudaDeviceReset() );
return 0;
}
```
```bash
nv-nsight-cu-cli --metrics
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe
_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_
st.sum,l1tex__t_requests_pipe_lsu_mem_global_op_st.sum,l1tex__data_pi
pe_lsu_wavefronts_mem_shared_op_ld.sum,l1tex__data_pipe_lsu_wavefr
onts_mem_shared_op_st.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_
shared_op_ld.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op
_st.sum,smsp__cycles_active.avg.pct_of_peak_sustained_elapsed,l1tex__
t_bytes_pipe_lsu_mem_global_op_st.sum.per_second,l1tex__t_bytes_pipe
_lsu_mem_global_op_ld.sum.per_second ./task3
```
```cpp
/*
* Copyright 2014 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <math.h>
#ifdef DEBUG
#define CUDA_CALL(F) if( (F) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__); exit(-1);}
#define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__-1); exit(-1);}
#else
#define CUDA_CALL(F) (F)
#define CUDA_CHECK()
#endif
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32
__shared__ double
smemArray[THREADS_PER_BLOCK_X][THREADS_PER_BLOCK_Y];
/* determine my row and column indices for the error checking code */
smemArray[threadIdx.x][threadIdx.y] =
a[INDX( tileX + threadIdx.x, tileY + threadIdx.y, m )];
} /* end if */
/* synchronize */
__syncthreads();
if( myRow < m && myCol < m )
{
/* write the result */
c[INDX( tileY + threadIdx.x, tileX + threadIdx.y, m )] =
smemArray[threadIdx.y][threadIdx.x];
} /* end if */
return;
} /* end smem_cuda_transpose */
} /* end host_dgemm */
/* start timers */
CUDA_CALL( cudaEventRecord( start, 0 ) );
end:
free( h_a );
free( h_c );
CUDA_CALL( cudaFree( d_a ) );
CUDA_CALL( cudaFree( d_c ) );
CUDA_CALL( cudaDeviceReset() );
return 0;
}
```
```cpp
/*
* Copyright 2014 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <math.h>
#ifdef DEBUG
#define CUDA_CALL(F) if( (F) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__); exit(-1);}
#define CUDA_CHECK() if( (cudaPeekAtLastError()) != cudaSuccess ) \
{printf("Error %s at %s:%d\n", cudaGetErrorString(cudaGetLastError()), \
__FILE__,__LINE__-1); exit(-1);}
#else
#define CUDA_CALL(F) (F)
#define CUDA_CHECK()
#endif
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32
smemArray[threadIdx.x][threadIdx.y] =
a[INDX( tileX + threadIdx.x, tileY + threadIdx.y, m )];
} /* end if */
/* synchronize */
__syncthreads();
if( myRow < m && myCol < m )
{
/* write the result */
c[INDX( tileY + threadIdx.x, tileX + threadIdx.y, m )] =
smemArray[threadIdx.y][threadIdx.x];
} /* end if */
return;
} /* end smem_cuda_transpose */
} /* end host_dgemm */
/* start timers */
CUDA_CALL( cudaEventRecord( start, 0 ) );
end:
free( h_a );
free( h_c );
CUDA_CALL( cudaFree( d_a ) );
CUDA_CALL( cudaFree( d_c ) );
CUDA_CALL( cudaDeviceReset() );
return 0;
}
```
First, you should take the *task1.cu* code, and complete the sections
indicated by **FIXME** to provide a proper thread-block group, and assign
that group to the group being used for printout purposes. You should only
need to modify the 2 lines containing **FIXME** for this first step.
You can compile your code as follows:
```bash
module load cuda
nvcc -arch=sm_70 -o task1 task1.cu -std=c++11
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++. Note that because we're using C++11 (which is required
for cooperative groups) we need a sufficiently modern compiler (gcc >= 5
should be sufficient). If you're on Summit, make sure to do `module load
gcc` because the system default gcc is not recent enough.
```bash
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1 ./task1
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```bash
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./task1
```
```bash
module load esslurm
srun -C gpu -N 1 -n 1 -t 10 -A m3502 --gres=gpu:1 -c 10 ./task1
```
If you prefer, you can instead reserve a GPU in an interactive session, and
then run an executable any number of times while the Slurm allocation is
active (this is recommended if there are enough available nodes):
```bash
salloc -C gpu -N 1 -t 60 -A m3502 --gres=gpu:1 -c 10
srun -n 1 ./task1
```
Note that you only need to `module load esslurm` once per login session;
this is what enables you to submit to the Cori GPU nodes.
Correct output should look like:
```bash
group partial sum: 256
```
If you need help, refer to the *task1_solution1.cu* file. (which contains the
solution for tasks 1a, 1b, and 1c)
Next, uncomment the next line that starts with the auto keyword, and complete that line to use the previously created thread block group and subdivide it into a set of 32-thread partitions, using the dynamic (runtime) partitioning method.
Compile and run the code as above. Correct output should look like:
```bash
group partial sum: 32
group partial sum: 32
group partial sum: 32
group partial sum: 32
group partial sum: 32
group partial sum: 32
group partial sum: 32
group partial sum: 32
```
Compile and run the code as above. Correct output should look like:
```bash
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
group partial sum: 16
```
Suppose we are given an input sequence and want to perform a stream compaction, removing the zeroes. The input:
```bash
3 4 3 7 0 5 0 8 0 0 0 4
```
should produce the compacted output:
```bash
3 4 3 7 5 8 4
```
Like many reduction type algorithms (the output here is potentially much
smaller than the input), we can easily imagine how to do this in a serial
fashion, but a fast parallel stream compaction requires some additional
thought. A common approach is to use a prefix sum. A prefix sum is a
data set, where each data item in the set represents the sum of the
previous input elements from the beginning of the input to that point. We
can use a prefix sum to help parallelize our stream compaction. We start
by creating an array of ones and zeroes, where there is a one
corresponding to the element we want to keep, and zero for the element we
want to discard:
```bash
3 4 3 7 0 5 0 8 0 0 0 4 (input data)
1 1 1 1 0 1 0 1 0 0 0 1 (filtering of input)
```
```bash
3 4 3 7 0 5 0 8 0 0 0 4 (input data)
1 1 1 1 0 1 0 1 0 0 0 1 (filtering of input)
0 1 2 3 4 4 5 5 6 6 6 6 (exclusive prefix sum of filtered data)
```
This prefix sum now contains the index into the output array that the input
position should be copied to. We only copy a position from input to output if
the corresponding filter element is not zero. This demonstrates how to use
a prefix sum to assist with a stream compaction, but doesn't identify how to
do the prefix sum in parallel, efficiently. A full treatment here is beyond the
scope of this document, but you can refer to this chapter for a thorough treatment:
https://people.eecs.berkeley.edu/~driscoll/cs267/papers/gpugems3_ch39.html
Some key takeaways are that a prefix sum has a sweeping operation, not unlike the sweep that is successively performed in a parallel reduction, but there are key differences. Two of these are that the sweep runs from "left" to "right" in the prefix sum, whereas it usually runs from right to left in a typical parallel reduction, and that the break points (i.e. the division of threads participating at each sweep phase) are different.
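To make the compaction mechanics concrete, here is a serial, host-side sketch of the filter / exclusive-scan / scatter sequence described above (illustrative only; the exercise performs these steps in parallel on the GPU):
```cpp
#include <cstdio>

int main() {
  const int n = 12;
  int in[n]  = {3, 4, 3, 7, 0, 5, 0, 8, 0, 0, 0, 4};
  int keep[n], scan[n], out[n];
  for (int i = 0; i < n; i++) keep[i] = (in[i] != 0);   // 1 = keep, 0 = discard
  scan[0] = 0;                                          // exclusive prefix sum of keep[]
  for (int i = 1; i < n; i++) scan[i] = scan[i - 1] + keep[i - 1];
  int m = 0;
  for (int i = 0; i < n; i++)
    if (keep[i]) { out[scan[i]] = in[i]; m++; }         // scan[i] is the output index
  for (int i = 0; i < m; i++) printf("%d ", out[i]);    // prints: 3 4 3 7 5 8 4
  printf("\n");
  return 0;
}
```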
When parallelizing a prefix sum, we often require multiple phases, for example a thread-block-level scan (prefix sum) operation, followed by another operation to "fix up" the threadblock-level results based on the data from other ("previous") thread blocks. These phases may require a grid-wide sync, and a typical scan from a library such as Thrust will use multiple kernel calls. Let's see if we can do it in a single kernel call. You won't have to write any scan code, other than inserting appropriate cooperative group sync points. We need sync points at the threadblock level (based on the threadblock-level group created for you) and also at the grid level.
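The sync structure you need to supply looks roughly like this inside the kernel (a sketch of the structure only, not the task2.cu code; the grid-wide sync is only valid when the kernel is launched via cudaLaunchCooperativeKernel and compiled with -rdc=true):
```cpp
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void scan_skeleton() {
  cg::thread_block block = cg::this_thread_block();
  cg::grid_group grid = cg::this_grid();
  // ... threadblock-level scan work ...
  block.sync();   // threadblock-level sync point
  // ... publish this block's partial result for other blocks to read ...
  grid.sync();    // grid-wide sync point
  // ... fix up this block's results using the other blocks' partial sums ...
}
```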
Once you have made the above modification, compile your code as follows:
```bash
nvcc -arch=sm_70 -o task2 task2.cu -rdc=true -std=c++11
```
```bash
lsfrun ./task2
```
```bash
number of SMs = 80
number of blocks per SM = 8
kernel time: 0.043872ms
thrust time: 0.083200ms
```
The code has silent validation built in, so no actual results are printed, other
than the above informational data. If you got a "*mismatch*" message,
something is wrong with your implementation.
Optional: profile the kernel with Nsight Compute:
```bash
module load nsight-compute
lsfrun nv-nsight-cu-cli ./task2
```
Now make the data set larger. A reasonable upper limit might be 32M elements. Make sure to choose a number that is divisible by 256, the threadblock size. For example, change:
```cpp
const int test_dsize = 256;
```
to something like:
```cpp
const int test_dsize = 1048576*16;
```
and recompile and rerun the code. Now which is faster, thrust or our naive
code?
Takeaway: don't write your own code if you can find a high-quality library
implementation. This is especially true for more complex algorithms like
sorting, prefix sums, and matrix multiply.
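For example, Thrust can do the whole compaction in a single call (a separate toy example, not one of the exercise files):
```cpp
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <cstdio>

int main() {
  int h[12] = {3, 4, 3, 7, 0, 5, 0, 8, 0, 0, 0, 4};
  thrust::device_vector<int> d(h, h + 12);
  // Remove every element equal to 0; new_end marks the end of the compacted range
  auto new_end = thrust::remove(d.begin(), d.end(), 0);
  int m = (int)(new_end - d.begin());
  for (int i = 0; i < m; i++) printf("%d ", (int)d[i]);  // prints: 3 4 3 7 5 8 4
  printf("\n");
  return 0;
}
```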
#include <cooperative_groups.h>
#include <stdio.h>
using namespace cooperative_groups;
const int nTPB = 256;
__device__ int reduce(thread_group g, int *x, int val) {
int lane = g.thread_rank();
for (int i = g.size()/2; i > 0; i /= 2) {
x[lane] = val; g.sync();
if (lane < i) val += x[lane + i]; g.sync();
}
if (g.thread_rank() == 0) printf("group partial sum: %d\n", val);
return val;
}
int main(){
int *data;
cudaMallocManaged(&data, nTPB*sizeof(data[0]));
for (int i = 0; i < nTPB; i++) data[i] = 1;
my_reduce_kernel<<<1,nTPB>>>(data);
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess) printf("cuda error: %s\n", cudaGetErrorString(err));
}
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/remove.h>
#include <cooperative_groups.h>
int main(){
// data setup
mytype *d_idata, *d_odata, *h_data;
unsigned *d_idxs;
size_t tsize = ((size_t)test_dsize)*sizeof(mytype);
h_data = (mytype *)malloc(tsize);
cudaMalloc(&d_idata, tsize);
cudaMalloc(&d_odata, tsize);
cudaMemset(d_odata, 0, tsize);
cudaMalloc(&d_idxs, test_dsize*sizeof(unsigned));
// check for support and device configuration
// and calculate maximum grid size
cudaDeviceProp prop;
cudaError_t err = cudaGetDeviceProperties(&prop, 0);
if (err != cudaSuccess) {printf("cuda error: %s\n",
cudaGetErrorString(err)); return 0;}
if (prop.cooperativeLaunch == 0) {printf("cooperative launch not supported\n"); return 0;}
int numSM = prop.multiProcessorCount;
printf("number of SMs = %d\n", numSM);
int numBlkPerSM;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlkPerSM,
my_remove_if<mytype>, nTPB, 0);
printf("number of blocks per SM = %d\n", numBlkPerSM);
// test 1: no remove values
for (int i = 0; i < test_dsize; i++) h_data[i] = i;
cudaMemcpy(d_idata, h_data, tsize, cudaMemcpyHostToDevice);
cudaStream_t str;
cudaStreamCreate(&str);
mytype remove_val = -1;
unsigned ds = test_dsize;
void *args[] = {(void *)&d_idata, (void *)&remove_val, (void *)&d_odata,
(void *)&d_idxs, (void *)&ds};
dim3 grid(numBlkPerSM*numSM);
dim3 block(nTPB);
cudaLaunchCooperativeKernel((void *)my_remove_if<mytype>, FIXME);
err = cudaMemcpy(h_data, d_odata, tsize, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {printf("cuda error: %s\n",
cudaGetErrorString(err)); return 0;}
//validate
for (int i = 0; i < test_dsize; i++) if (h_data[i] != i){printf("mismatch 1 at %d, was: %d, should be: %d\n", i, h_data[i], i); return 1;}
// test 2: with remove values
int val = 0;
for (int i = 0; i < test_dsize; i++){
if ((rand()/(float)RAND_MAX) > 0.5) h_data[i] = val++;
else h_data[i] = -1;}
thrust::device_vector<mytype> t_data(h_data, h_data+test_dsize);
cudaMemcpy(d_idata, h_data, tsize, cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaLaunchCooperativeKernel((void *)my_remove_if<mytype>, FIXME);
cudaEventRecord(stop);
float et;
cudaMemcpy(h_data, d_odata, tsize, cudaMemcpyDeviceToHost);
cudaEventElapsedTime(&et, start, stop);
//validate
for (int i = 0; i < val; i++) if (h_data[i] != i){printf("mismatch 2 at %d, was: %d, should be: %d\n", i, h_data[i], i); return 1;}
printf("kernel time: %fms\n", et);
cudaEventRecord(start);
thrust::remove(t_data.begin(), t_data.end(), -1);
cudaEventRecord(stop);
thrust::host_vector<mytype> th_data = t_data;
// validate
for (int i = 0; i < val; i++) if (h_data[i] != th_data[i]){printf("mismatch 3 at %d, was: %d, should be: %d\n", i, th_data[i], h_data[i]); return 1;}
cudaEventElapsedTime(&et, start, stop);
printf("thrust time: %fms\n", et);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/remove.h>
#include <cooperative_groups.h>
int main(){
// data setup
mytype *d_idata, *d_odata, *h_data;
unsigned *d_idxs;
size_t tsize = ((size_t)test_dsize)*sizeof(mytype);
h_data = (mytype *)malloc(tsize);
cudaMalloc(&d_idata, tsize);
cudaMalloc(&d_odata, tsize);
cudaMemset(d_odata, 0, tsize);
cudaMalloc(&d_idxs, test_dsize*sizeof(unsigned));
// check for support and device configuration
// and calculate maximum grid size
cudaDeviceProp prop;
cudaError_t err = cudaGetDeviceProperties(&prop, 0);
if (err != cudaSuccess) {printf("cuda error: %s\n",
cudaGetErrorString(err)); return 0;}
if (prop.cooperativeLaunch == 0) {printf("cooperative launch not supported\n"); return 0;}
int numSM = prop.multiProcessorCount;
printf("number of SMs = %d\n", numSM);
int numBlkPerSM;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlkPerSM,
my_remove_if<mytype>, nTPB, 0);
printf("number of blocks per SM = %d\n", numBlkPerSM);
// test 1: no remove values
for (int i = 0; i < test_dsize; i++) h_data[i] = i;
cudaMemcpy(d_idata, h_data, tsize, cudaMemcpyHostToDevice);
cudaStream_t str;
cudaStreamCreate(&str);
mytype remove_val = -1;
unsigned ds = test_dsize;
void *args[] = {(void *)&d_idata, (void *)&remove_val, (void *)&d_odata,
(void *)&d_idxs, (void *)&ds};
dim3 grid(numBlkPerSM*numSM);
dim3 block(nTPB);
cudaLaunchCooperativeKernel((void *)my_remove_if<mytype>, grid,
block, args, 0, str);
err = cudaMemcpy(h_data, d_odata, tsize, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {printf("cuda error: %s\n",
cudaGetErrorString(err)); return 0;}
//validate
for (int i = 0; i < test_dsize; i++) if (h_data[i] != i){printf("mismatch 1 at %d, was: %d, should be: %d\n", i, h_data[i], i); return 1;}
// test 2: with remove values
int val = 0;
for (int i = 0; i < test_dsize; i++){
if ((rand()/(float)RAND_MAX) > 0.5) h_data[i] = val++;
else h_data[i] = -1;}
thrust::device_vector<mytype> t_data(h_data, h_data+test_dsize);
cudaMemcpy(d_idata, h_data, tsize, cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
cudaLaunchCooperativeKernel((void *)my_remove_if<mytype>, grid,
block, args, 0, str);
cudaEventRecord(stop);
float et;
cudaMemcpy(h_data, d_odata, tsize, cudaMemcpyDeviceToHost);
cudaEventElapsedTime(&et, start, stop);
//validate
for (int i = 0; i < val; i++) if (h_data[i] != i){printf("mismatch 2 at %d, was: %d, should be: %d\n", i, h_data[i], i); return 1;}
printf("kernel time: %fms\n", et);
cudaEventRecord(start);
thrust::remove(t_data.begin(), t_data.end(), -1);
cudaEventRecord(stop);
thrust::host_vector<mytype> th_data = t_data;
// validate
for (int i = 0; i < val; i++) if (h_data[i] != th_data[i]){printf("mismatch 3 at %d, was: %d, should be: %d\n", i, th_data[i], h_data[i]); return 1;}
cudaEventElapsedTime(&et, start, stop);
printf("thrust time: %fms\n", et);
return 0;
}
For your first task, you are given a code that performs a silly computation
element-wise on a vector. We already implemented a chunked version of
this code using multiple CUDA streams in Homework 7. Let's start by
reviewing the performance impact that CUDA streams had on this code.
Compile it using the following:
```
module load cuda/11.4.0
nvcc -o streams streams.cu -DUSE_STREAMS
```
The module load command selects a CUDA compiler for your use. The
module load command only needs to be done once per session/login.
*nvcc* is the CUDA compiler invocation command. The syntax is generally
similar to gcc/g++.
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1 ./streams
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./streams
```
```
module load cgpu cuda/11.4.0
nvcc -o streams streams.cu -DUSE_STREAMS
```
To run during the node reservation (10:30-12:30 Pacific time on July 16):
```
module load cgpu cuda/11.4.0
srun -C gpu -N 1 -n 1 -t 10 -A ntrain --reservation=cuda_training -q shared
-G 1 -c 8 ./streams
```
In this case, the output will show the elapsed time of the non-overlapped
version of the code compared to the overlapped version of the code. The
non-overlapped version of the code copies the entire vector to the device,
then launches the processing kernel, then copies the entire vector back to
the host. In the overlapped version, the vector is broken up into chunks,
and then each chunk is copied and processed asynchronously on the GPU
using CUDA streams.
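The per-chunk pattern in the overlapped path has roughly this shape (a sketch that borrows the names from the streams.cu listing below; the file's actual loop may differ in its details):
```cpp
const size_t chunk = ds / chunks;                    // elements per chunk
for (int i = 0; i < chunks; i++) {
  cudaStream_t s = streams[i % num_streams];         // round-robin over the streams
  cudaMemcpyAsync(d_x + i * chunk, h_x + i * chunk, chunk * sizeof(ft),
                  cudaMemcpyHostToDevice, s);        // copy chunk i to the device
  gaussian_pdf<<<(chunk + 255) / 256, 256, 0, s>>>(d_x + i * chunk, d_y + i * chunk,
                  0.0, 1.0, chunk);                  // process chunk i
  cudaMemcpyAsync(h_y + i * chunk, d_y + i * chunk, chunk * sizeof(ft),
                  cudaMemcpyDeviceToHost, s);        // copy chunk i back to the host
}
for (int i = 0; i < num_streams; i++) cudaStreamSynchronize(streams[i]);
```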
You can also run this code with Nsight Systems if you wish to observe the
overlapping behavior:
On Summit:
```
module load nsight-systems
lsfrun nsys profile -o <destination_dir>/streams.qdrep ./streams
```
On Cori:
```
module load nsight-systems
srun -n 1 nsys profile -o <destination_dir>/streams.qdrep ./streams
```
Note that you will have to copy this file over to your local machine and
install Nsight Systems for visualization. You can download Nsight Systems
here:
https://developer.nvidia.com/nsight-systems
Once you have inserted your OpenMP statement(s) to issue the chunks from multiple host threads, compile and run using the following instructions (one possible placement for the pragma is sketched after the run commands below).
On Summit:
```
nvcc -Xcompiler -fopenmp -o streams streams.cu -DUSE_STREAMS
export OMP_NUM_THREADS=8
jsrun -n1 -a1 -c8 -bpacked:8 -g1 ./streams
```
On Cori:
```
nvcc -Xcompiler -fopenmp -o streams streams.cu -DUSE_STREAMS
export OMP_NUM_THREADS=8
srun -C gpu -N 1 -n 1 -t 10 -A ntrain --reservation=cuda_training -q shared
-G 1 -c 8 ./streams
```
To run using 4 GPUs:
On Summit:
```
nvcc -Xcompiler -fopenmp -o streams streams.cu -DUSE_STREAMS
export OMP_NUM_THREADS=8
jsrun -n1 -a1 -c8 -bpacked:8 -g4 ./streams
```
On Cori:
```
nvcc -Xcompiler -fopenmp -o streams streams.cu -DUSE_STREAMS
export OMP_NUM_THREADS=8
srun -C gpu -N 1 -n 1 -t 10 -A ntrain --reservation=cuda_training -q shared
-G 4 -c 8 ./streams
```
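One possible way to issue the chunks from multiple host threads is an OpenMP parallel for around the chunk loop (a sketch only; the pragma's exact placement in streams.cu may differ):
```cpp
#pragma omp parallel for
for (int i = 0; i < chunks; i++) {
  cudaStream_t s = streams[i % num_streams];
  // ... the same per-chunk cudaMemcpyAsync / gaussian_pdf / cudaMemcpyAsync
  //     sequence as in the earlier sketch, issued into stream s ...
}
```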
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
// modifiable
typedef float ft;
const int chunks = 64;
const size_t ds = 1024*1024*chunks;
const int count = 22;
const int num_streams = 8;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
// compute average gaussian pdf value over a window around each point
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y,
const ft mean, const ft sigma, const int n) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n) {
ft in = x[idx] - (count / 2) * 0.01f;
ft out = 0;
for (int i = 0; i < count; i++) {
ft temp = (in - mean) / sigma;
out += gpdf(temp, sigma);
in += 0.01f;
}
y[idx] = out / count;
}
}
// error check macro
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
}\
} while (0)
// host-based timing
#define USECPSEC 1000000ULL
int main() {
ft *h_x, *d_x, *h_y, *h_y1, *d_y;
cudaHostAlloc(&h_x, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y1, ds*sizeof(ft), cudaHostAllocDefault);
cudaMalloc(&d_x, ds*sizeof(ft));
cudaMalloc(&d_y, ds*sizeof(ft));
cudaCheckErrors("allocation error");
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++) {
cudaStreamCreate(&streams[i]);
}
cudaCheckErrors("stream creation error");
et1 = dtime_usec(et1);
std::cout << "non-stream elapsed time: " << et1/(float)USECPSEC <<
std::endl;
#ifdef USE_STREAMS
cudaMemset(d_y, 0, ds * sizeof(ft));
et = dtime_usec(et);
return 0;
}
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
// modifiable
typedef float ft;
const int chunks = 64;
const size_t ds = 1024*1024*chunks;
const int count = 22;
const int num_streams = 8;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
// compute average gaussian pdf value over a window around each point
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y,
const ft mean, const ft sigma, const int n) {
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n) {
ft in = x[idx] - (count / 2) * 0.01f;
ft out = 0;
for (int i = 0; i < count; i++) {
ft temp = (in - mean) / sigma;
out += gpdf(temp, sigma);
in += 0.01f;
}
y[idx] = out / count;
}
}
// host-based timing
#define USECPSEC 1000000ULL
int main() {
ft *h_x, *d_x, *h_y, *h_y1, *d_y;
cudaHostAlloc(&h_x, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y, ds*sizeof(ft), cudaHostAllocDefault);
cudaHostAlloc(&h_y1, ds*sizeof(ft), cudaHostAllocDefault);
cudaMalloc(&d_x, ds*sizeof(ft));
cudaMalloc(&d_y, ds*sizeof(ft));
cudaCheckErrors("allocation error");
cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++) {
cudaStreamCreate(&streams[i]);
}
cudaCheckErrors("stream creation error");
et1 = dtime_usec(et1);
std::cout << "non-stream elapsed time: " << et1/(float)USECPSEC <<
std::endl;
#ifdef USE_STREAMS
cudaMemset(d_y, 0, ds * sizeof(ft));
et = dtime_usec(et);
for (int i = 0; i < ds; i++) {
if (h_y[i] != h_y1[i]) {
std::cout << "mismatch at " << i << " was: " << h_y[i] << " should be: "
<< h_y1[i] << std::endl;
return -1;
}
}
return 0;
}
# Multi-Process Service
On Cori GPU, first grab an interactive session. Make sure that you request
at least a few slots for MPI, but we'll only need one GPU.
```
module purge
module load cgpu gcc/8.3.0 cuda/11.4.0 openmpi/4.0.3
salloc -A ntrain -q shared --reservation=cuda_mps -C gpu -N 1 -n 4 -t 60 -c
4 --gpus=1
```
The test code used in the lecture is in `test.cu`, and it can be compiled with:
```
nvcc -o test -ccbin=mpicxx test.cu
```
If you're running somewhere where you don't have MPI, you can compile
the application without MPI as follows:
```
nvcc -DNO_MPI -o test test.cu
```
Then in all of the examples below, instead of launching with `mpirun`, use
the provided `run_no_mpi.sh` script, which launches 4 redundant copies of
the same process. This script can also be useful on systems like Summit, where you launch jobs from a node other than the compute node and `nsys jsrun ...` is therefore less useful than `jsrun ... nsys`.
Your exercise is to try some of the experiments from the lecture and see if
you can reproduce the findings. Try the following experiments first, without
MPS (note that this application does take about 20 seconds to run, so be
patient):
```
nsys profile --stats=true -t nvtx,cuda -s none -o 1_rank_no_MPS_N_1e9 -f
true mpirun -np 1 ./test 1073741824
nsys profile --stats=true -t nvtx,cuda -s none -o 4_ranks_no_MPS_N_1e9 -f
true mpirun -np 4 ./test 1073741824
```
Verify from both the application stdout and from the profiling data that the
average kernel runtime is longer when using 4 ranks on the same GPU.
Now start MPS and repeat the above experiment with 4 ranks, verifying
that the average kernel runtime is about the same as in the 1 rank case
(again, consult both the stdout and the profiling data).
```
nvidia-cuda-mps-control -d
nsys profile --stats=true -t nvtx,cuda -s none -o 4_ranks_with_MPS_N_1e9
-f true mpirun -np 4 ./test 1073741824
```
Now verify that you can stop MPS and the original behavior returns.
```
echo "quit" | nvidia-cuda-mps-control
nsys profile --stats=true -t nvtx,cuda -s none -o 4_ranks_no_MPS_N_1e9 -f
true mpirun -np 4 ./test 1073741824
```
Vary the problem size `N` until you've found the minimum size where you
can definitively say that MPS provides a clear benefit over the default
compute mode case.
run_no_mpi.sh (excerpt):
#!/bin/bash
PROBLEM_SIZE=1073741824
NUM_RANKS=4
test.cu (excerpt):
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#endif
if (argc >= 2) {
N = atoi(argv[1]);
}
#ifdef NO_MPI
// If not using MPI, specify at command line how many "ranks" there are
int num_ranks = 1;
if (argc >= 3) {
num_ranks = atoi(argv[2]);
}
#endif
double* x;
cudaMalloc((void**) &x, N_per_rank * sizeof(double));
// Number of repetitions
std::cout << "Time per kernel = " << duration.count() / (double) num_reps
<< " ms " << std::endl;
#ifndef NO_MPI
MPI_Finalize();
#endif
}
# **Task 1**
First, compile the code as follows, and run the code to observe the reported
behavior:
```
module load cuda
nvcc -arch=sm_70 task1.cu -o task1 -lineinfo
```
We are compiling the code for the GPU architecture being used (Volta, SM 7.0 in this case), and we are also compiling with the -lineinfo switch. As a CUDA support engineer, you know this switch will be useful when it comes to using compute-sanitizer.
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1 ./task1
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./task1
```
```
module load cgpu cuda/11.4.0
nvcc -arch=sm_70 task1.cu -o task1 -lineinfo
```
```
Success!
```
## Part A
Use basic *compute-sanitizer* functionality (no additional switches) to
identify a problem in the code. Using the output from *compute-sanitizer*,
identify the offending line of code. Fix this issue.
Hints:
- Remember that *-lineinfo* will cause compute-sanitizer (in this usage) to
report the actual line of code that is causing the problem
- Even if you didn't have this information (the line number), could you use other compute-sanitizer output to quickly deduce the line to focus on in this case? You could use the type of memory access violation as a clue. Which lines of code in the kernel perform that type of memory access? (Hint: there is only one line of kernel code that does.)
- Memory access problems are often caused by indexing errors. See if you can spot an indexing error that may lead to this issue (hint: the classic computer science "off-by-one" error; see the sketch after this list).
- Refer to *task1_solution.cu* if you get stuck
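As a generic illustration of the kind of off-by-one bound check the hints describe (hypothetical code, not the actual task1.cu kernel):
```cpp
// Thread idx == N slips past the bound check and touches one element
// beyond the end of the allocation, which compute-sanitizer reports.
__global__ void scale(float *data, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx <= N)          // BUG: should be idx < N
    data[idx] *= 2.0f;
}
```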
## Part B
Yay! You sorted out the problem, made the change to indexing, and now
the code prints "Success!" It's time to send the user on their way. Or is it?
Could there be other errors? Use additional compute-sanitizer switches
(*--tool racecheck*, *--tool initcheck*, *--tool synccheck*) to identify other
"latent" issues. Fix them.
Hints:
- The only tool that should report a problem at this point is the racecheck
tool.
- See if you can use the line number information embedded in the error
reports to identify the trouble "zone" in the kernel code
- Since you know that the racecheck tool reports race issues with shared memory usage (only), and that these often involve missing synchronization, can you identify the right place to insert appropriate synchronization into the kernel code? Try experimenting; inserting additional synchronization into CUDA kernel code usually does not break correctness (see the sketch after this list).
- Refer to *task1_solution.cu* if you get stuck
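As a generic illustration of the kind of shared-memory race racecheck reports, and the synchronization that fixes it (hypothetical code, not the actual task1.cu kernel):
```cpp
__global__ void reverse256(int *d) {
  __shared__ int s[256];
  int t = threadIdx.x;
  s[t] = d[t];
  __syncthreads();       // without this barrier, the read below races with other threads' writes above
  d[t] = s[255 - t];
}
```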
# **Task 2**
In this task we will explore basic usage of cuda-gdb. Once again you are
providing user support at a cluster help desk. The user has a code that
produces a *-inf* (negative floating-point infinity) result, and that is not
expected. The code consists of a transformation operation (one data
element created/modified per thread) followed by a reduction operation
(per-thread results summed together). The output of the reduction is *-inf*.
See if you can use *cuda-gdb* to identify the problem and rectify it.
```
nvcc -arch=sm_70 task2.cu -o task2 -G -g -std=c++14
```
On Summit:
```
jsrun -n1 -a1 -c1 -g1 cuda-gdb ./task2
```
On Cori:
```
srun -n 1 cuda-gdb ./task2
```
Don't forget that you cannot inspect device data until execution is stopped at a breakpoint in device code.
Once you have identified the source of the issue, see if you can propose a simple code modification to work around it. If you get stuck on this part (proposing a solution), refer to *task2_solution.cu*. Careful code inspection will likely point out the issue immediately; however, the purpose of this task is not actually to fix the code that way, but to learn to use *cuda-gdb*.
Hints:
- The code is attempting to estimate the sum of an alternating harmonic
series (ahs), whose sum should be equal to the natural log of 2.
- The code is broken into two parts: the ahs term generator (produced by
the device function ahs) which takes only the index of the term to generate,
and a standard sweep parallel reduction, similar to the content in session 5
of this training series.
- Generally speaking, floating-point arithmetic on *inf* or *-inf* inputs will produce an *inf* or *-inf* output
- Decide whether you think the *-inf* is likely to appear as a result of the initial transformation operation, or of the subsequent reduction operation
- Use this reasoning to choose a point for an initial breakpoint (see the example cuda-gdb session after this list)
- Inspect data to see if you can observe *-inf* in any of the intermediate
data
- Use this observation to repeat the process of setting a breakpoint and
inspecting data
- Alternatively, work linearly through the code, setting an initial breakpoint
and single-stepping, to see if you can observe incorrect data
- You may need to change thread focus or observe data belonging to other
threads
- The reduction also offers the opportunity to tackle this problem via
divide-and-conquer, or binary searching
- Consider reducing the problem size (i.e. length of terms to generate the
estimate) to simplify your debug effort
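A typical cuda-gdb session for this kind of investigation might look like the following; the line number (40), the array name (`terms`), and the thread coordinates are hypothetical placeholders. `break` sets a breakpoint in device code, `info cuda threads` shows where threads are stopped, `cuda thread` changes thread focus, and `print`/`next` inspect data and single-step:
```
(cuda-gdb) break task2.cu:40
(cuda-gdb) run
(cuda-gdb) info cuda threads
(cuda-gdb) print terms[threadIdx.x]
(cuda-gdb) cuda thread (37,0,0)
(cuda-gdb) print terms[threadIdx.x]
(cuda-gdb) next
```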
#include <iostream>
// Thread block size
#define BLOCK_SIZE 32
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
// Invoke kernel
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
int main(){
const int num_m = 3; // we need 3 matrices
const int side_dim = 128; // side dimension of square matrix
Matrix *m = new Matrix[num_m]; // allocate matrix storage part 1
for (int i = 0; i < num_m; i++){
m[i].width = m[i].height = m[i].stride = side_dim; // set matrix params
m[i].elements = new float[side_dim*side_dim]; // allocate matrix storage part 2
if (i < 2) // initialize first two matrices
for (int j = 0; j < side_dim*side_dim; j++) m[i].elements[j] = 1.0f; }
MatMul(m[0], m[1], m[2]); // perform matrix-multiply
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
for (int i = 0; i < side_dim*side_dim; i++) // perform results checking
if (m[2].elements[i] != (float)side_dim) {std::cout << "Mismatch at index: " << i << " expected: " << (float)side_dim << " got: " << m[2].elements[i] << std::endl; return 0;}
std::cout << "Success!" << std::endl;
for (int i = 0; i < num_m; i++)
delete[] m[i].elements;
delete[] m;
return 0;
}
#include <iostream>
// Thread block size
#define BLOCK_SIZE 32
// Invoke kernel
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
int main(){
const int num_m = 3; // we need 3 matrices
const int side_dim = 128; // side dimension of square matrix
Matrix *m = new Matrix[num_m]; // allocate matrix storage part 1
for (int i = 0; i < num_m; i++){
m[i].width = m[i].height = m[i].stride = side_dim; // set matrix params
m[i].elements = new float[side_dim*side_dim]; // allocate matrix storage part 2
if (i < 2) // initialize first two matrices
for (int j = 0; j < side_dim*side_dim; j++) m[i].elements[j] = 1.0f; }
MatMul(m[0], m[1], m[2]); // perform matrix-multiply
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
for (int i = 0; i < side_dim*side_dim; i++) // perform results checking
if (m[2].elements[i] != (float)side_dim) {std::cout << "Mismatch at index: " << i << " expected: " << (float)side_dim << " got: " << m[2].elements[i] << std::endl; return 0;}
std::cout << "Success!" << std::endl;
for (int i = 0; i < num_m; i++)
delete[] m[i].elements;
delete[] m;
return 0;
}
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <cstdio>
You can refer to the solutions in the Solutions directory for help/hints when
stuck.
### Task 1
#### Stream Capture
This task is an example of how to use stream capture with Cuda Graphs. We will create a graph from a sequence of kernel launches across two streams.
(Figure: stream capture example)
This is the same example from the slides; feel free to refer to them for help and hints.
Go ahead and take a look at the code now to get a sense of the new Graph API calls. On a first pass, ignore the Graph APIs and try to get a feel for the underlying code and what it is doing. The kernels themselves are not doing any specific math; they simply represent some arbitrary small kernels. Remember to think about the function of the two streams, and refer back to the figure above to make sure you see the inherent dependencies created by the Cuda Events.
`bool graphCreated=false;` will be our method of setting up the graph on the first pass only (loop iteration 0) and then going straight to launching the graph in each subsequent iteration (1 through N-1).
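For orientation, a minimal, self-contained stream-capture example (separate from the exercise code) looks like this:
```cpp
#include <cstdio>
#include <cuda_runtime_api.h>

__global__ void inc(int *x) { x[threadIdx.x] += 1; }

int main() {
  int *d;
  cudaMalloc(&d, 32 * sizeof(int));
  cudaMemset(d, 0, 32 * sizeof(int));
  cudaStream_t s;
  cudaStreamCreate(&s);

  cudaGraph_t graph;
  cudaGraphExec_t instance;
  // Work issued into the stream during capture is recorded into the graph, not executed
  cudaStreamBeginCapture(s, cudaStreamCaptureModeGlobal);
  inc<<<1, 32, 0, s>>>(d);
  cudaStreamEndCapture(s, &graph);
  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);

  // Replay the captured work many times with low per-launch overhead
  for (int i = 0; i < 1000; i++) cudaGraphLaunch(instance, s);
  cudaStreamSynchronize(s);
  printf("done\n");
  return 0;
}
```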
```
module load cuda/11.4.0
nvcc -arch=sm_70 axpy_stream_capture_with_fixme.cu -o
axpy_stream_capture_with_fixme
```
We are compiling the code for the GPU architecture being used (Volta SM 7.0 in this case). Cuda Graphs has been available in every Cuda Toolkit since Cuda 10, but some features may be version-dependent.
```
bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1 -g1
./axpy_stream_capture_with_fixme
```
Alternatively, you may want to create an alias for your bsub command in
order to make subsequent runs easier:
```
alias lsfrun='bsub -W 10 -nnodes 1 -P <allocation_ID> -Is jsrun -n1 -a1 -c1
-g1'
lsfrun ./axpy_stream_capture_with_fixme
```
```
module load cgpu cuda/11.4.0
nvcc -arch=sm_70 axpy_stream_capture_with_fixme.cu -o
axpy_stream_capture_with_fixme
```
FIXMEs
1. cudaGraphCreate(FIXME, 0);
2. cudaGraphInstantiate(FIXME, graph, NULL, NULL, 0);
3. graphCreated = FIXME;
4. cudaGraphLaunch(FIXME, streams[0]);
After you have completed the FIXMEs, you will see a time printed out when you run the code. This is the total time from running the graph 1000 times. You can compare it to the time from the file axpy_stream_capture_timer.cu, which is the same code running the Cuda work in streams instead of the graph. These examples are primarily meant to introduce the topic and the API, so they are not particularly performant. Even so, you should still be able to see a small performance increase using the graph, from the launch overhead savings. The instantiation phase is not included in the timing, however, so it is not exactly an apples-to-apples comparison. It merely highlights the ideas we saw in the slides.
### Task 2
#### Explicit Graph Creation w/ Library Call
In this task, we will look at a few of the explicit graph creation APIs and at how to capture a library call with stream capture. A key to this example is remembering that, while we are using both explicit graph creation and stream capture, both are just ways of defining a `cudaGraph_t`, which we then instantiate into a `cudaGraphExec_t`.
We are creating 2 kernel nodes and a child graph derived from a cuBLAS axpy function call. See the diagram below for a visual.
(Figure: Cuda Graph diagram)
The relevant graph API documentation is here:
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html
The API is a bit tricky because it is quite different from anything else in Cuda at first, but the patterns are actually quite familiar. It is just a different way to define Cuda work.
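For orientation, here is a minimal, self-contained example of adding one kernel node explicitly with cudaGraphAddKernelNode (separate from the exercise code):
```cpp
#include <cstdio>
#include <cuda_runtime_api.h>

__global__ void add_one(float *y) { y[blockIdx.x * blockDim.x + threadIdx.x] += 1.0f; }

int main() {
  const int n = 256;
  float *d_y;
  cudaMalloc(&d_y, n * sizeof(float));
  cudaMemset(d_y, 0, n * sizeof(float));

  cudaGraph_t graph;
  cudaGraphCreate(&graph, 0);

  // Describe the kernel launch as node parameters instead of using <<< >>>
  void *kargs[] = {(void *)&d_y};
  cudaKernelNodeParams params = {0};
  params.func = (void *)add_one;
  params.gridDim = dim3(n / 32);
  params.blockDim = dim3(32);
  params.sharedMemBytes = 0;
  params.kernelParams = kargs;
  params.extra = NULL;

  cudaGraphNode_t node;
  cudaGraphAddKernelNode(&node, graph, NULL, 0, &params);  // no dependencies: a root node

  cudaGraphExec_t instance;
  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
  cudaGraphLaunch(instance, 0);
  cudaDeviceSynchronize();
  printf("graph ran\n");
  return 0;
}
```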
We will follow the same instructions as before to compile, plus this time
adding -lcublas to include the library.
```
nvcc -arch=sm_70 -lcublas axpy_cublas_with_fixme.cu -o
axpy_cublas_with_fixme
```
```
lsfrun ./axpy_cublas_with_fixme
```
```
srun -n 1 ./axpy_cublas_with_fixme
```
FIXME
1. cudaGraphCreate(FIXME, 0);
2. cudaGraphAddChildGraphNode(FIXME, graph, FIXME,
nodeDependencies.size(), libraryGraph);
3. cudaGraphLaunch(FIXME, stream1);
#include <stdio.h>
#include <vector>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#define N 500000
__global__
void kernel_c(float* x, float* y){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) y[idx] += 1;
}
int main(){
cudaStream_t stream1;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cublasHandle_t cublas_handle;
cublasCreate(&cublas_handle);
cublasSetStream(cublas_handle, stream1);
// Set up graph
cudaGraph_t graph; // main graph
cudaGraph_t libraryGraph; // sub graph for cuBLAS call
std::vector<cudaGraphNode_t> nodeDependencies;
cudaGraphNode_t kernelNode1, kernelNode2, libraryNode;
// Library call
cublasSaxpy(cublas_handle, N, &d_a, d_x, 1, d_y, 1);
cudaCheckErrors("cublasSaxpy failure");
cudaStreamEndCapture(stream1, &libraryGraph);
cudaCheckErrors("Stream capture end failure");
cudaGraphAddChildGraphNode(&libraryNode, graph,
nodeDependencies.data(),
nodeDependencies.size(), libraryGraph);
cudaCheckErrors("Adding libraryNode failed");
nodeDependencies.clear();
nodeDependencies.push_back(libraryNode); // manage dependency vector
nodeDependencies.clear();
nodeDependencies.push_back(kernelNode2); // manage dependency vector
cudaGraphExec_t instance;
cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
cudaCheckErrors("Graph instantiation failed");
cudaDeviceSynchronize();
return 0;
}
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <ctime>
#include <ratio>
#include <chrono>
#include <iostream>
#define N 500000
__global__
void kernel_b(float * x, float * y){
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N) y[idx] = 2.0*x[idx] + y[idx];
}
__global__
void kernel_c(float * x, float * y){
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N) y[idx] = 2.0*x[idx] + y[idx];
}
__global__
void kernel_d(float * x, float * y){
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < N) y[idx] = 2.0*x[idx] + y[idx];
}
int main(){
cudaEventCreateWithFlags(&event1, cudaEventDisableTiming);
cudaEventCreateWithFlags(&event2, cudaEventDisableTiming);
// Set up graph
bool graphCreated=false;
cudaGraph_t graph;
cudaGraphExec_t instance;
cudaGraphCreate(&graph, 0);
// Launching work
for (int i = 0; i < 100; ++i){
if (graphCreated == false){
// If first pass, starting stream capture
cudaStreamBeginCapture(streams[0],
cudaStreamCaptureModeGlobal);
cudaCheckErrors("Stream begin capture failed");
cudaEventRecord(event1, streams[0]);
cudaCheckErrors("Event record failed");
cudaStreamWaitEvent(streams[1], event1);
cudaCheckErrors("Event wait failed");
cudaEventRecord(event2, streams[1]);
cudaCheckErrors("Event record failed");
cudaStreamWaitEvent(streams[0], event2);
cudaCheckErrors("Event wait failed");
kernel_d<<<blocks, threads, 0, streams[0]>>>(d_x, d_y);
cudaCheckErrors("Kernel d failed");
cudaStreamEndCapture(streams[0], &graph);
cudaCheckErrors("Stream end capture failed");
graphCreated = true;
}
// Launch the graph instance
cudaGraphLaunch(instance, streams[0]);
cudaCheckErrors("Launching graph failed");
cudaStreamSynchronize(streams[0]);
}
high_resolution_clock::time_point t1 = high_resolution_clock::now();
cudaDeviceSynchronize();
high_resolution_clock::time_point t2 = high_resolution_clock::now();
std::cout << "Time " << total_time.count() << " s" << std::endl;
cudaDeviceSynchronize();
return 0;
}
__global__ void transposeCoalesced(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM][TILE_DIM];
__syncthreads();