002 - Introduction To CUDA Programming - 1
[Figure: CPU and GPU memory organization; GPU memory is 1GB on our systems; interconnect bandwidths of 6.4GB/sec – 31.92GB/sec at 8B per transfer.]
Target Applications
int a[N]; // N is large
for all elements of a compute
a[i] = a[i] * fade
• Lots of independent computations
– CUDA threads need not be independent (see the kernel sketch below)
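A minimal sketch of how this loop maps onto a CUDA kernel, one thread per element; the kernel name and the fade parameter are illustrative, mirroring the darradd example developed later in these notes:
__global__ void fadearray (float *a, float fade, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x; // one array element per thread
if (i < N) a[i] = a[i] * fade; // guard: the last block may be only partially used
}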
Programmer’s View of the GPU
• Threads within a block can cooperate with each other by:
– Synchronizing their execution
• For hazard-free shared memory accesses
– Efficiently sharing data through a low latency shared memory
• Two threads from two different blocks cannot cooperate
[Figure: a kernel launch creates a grid of thread blocks on the device (Grid 1, Grid 2); each block, e.g., Block (1, 1), contains threads Thread (0, 0) … Thread (4, 0).]
• Global memory
– Main means of communicating R/W Data between
host and device
– Contents visible to all threads
[Figure: CUDA software layers — CUDA libraries (e.g., fft()), the CUDA runtime API (cuda…() calls), and the CUDA driver API (cu…() calls).]
Reasoning about CUDA call ordering
• GPU communication via cuda…() calls and
kernel invocations
– cudaMalloc, cudaMemcpy, …
• Asynchronous from the CPU’s perspective
– CPU places a request in a “CUDA” queue
– requests are handled in-order
• Streams allow for multiple queues
– More on this much later on
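A minimal sketch of what this asynchrony looks like in host code (the kernel and variable names are illustrative):
// the launch is placed in the queue and control returns to the CPU immediately
akernel <<<blocks, threads_block>>> (da, N);
// ... the CPU is free to do other work here ...
cudaThreadSynchronize (); // blocks the CPU until all queued requests have completed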
CUDA API: Example
int a[N];
for (i = 0; i < N; i++)
a[i] = a[i] + x;
1. Allocate CPU Data Structure
2. Initialize Data on CPU
3. Allocate GPU Data Structure
4. Copy Data from CPU to GPU
5. Define Execution Configuration
6. Run Kernel
7. CPU synchronizes with GPU
8. Copy Data from GPU to CPU
9. De-allocate GPU and CPU memory
1. Allocate CPU Data Structure
float *ha;
main (int argc, char *argv[]) {
int N = atoi (argv[1]);
ha = (float *) malloc (sizeof (float) * N);
...
}
2. Initialize Data on CPU
int i;
for (i = 0; i < N; i++) ha[i] = i;
3. Allocate GPU Data Structure
float *da;
cudaMalloc ((void **) &da, sizeof (float) * N);
4. Copy Data from CPU to GPU
cudaMemcpy ((void *) da, (void *) ha, sizeof (float) * N, cudaMemcpyHostToDevice);
• enum cudaMemcpyKind
– cudaMemcpyHostToDevice
– cudaMemcpyDeviceToHost
– cudaMemcpyDeviceToDevice
5. Define Execution Configuration
• How many blocks and how many threads per block
• Round the block count up so that all N elements are covered:
blocks = (N + threads_block - 1) / threads_block;
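For example (the numbers are illustrative), with N = 1000 and threads_block = 64 this gives blocks = (1000 + 64 - 1) / 64 = 16, i.e., 16 × 64 = 1024 threads; the extra 24 threads do no work because of the i < N check in the kernel.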
6. Launch Kernel & 7. CPU/GPU Synchronization
• cudaThreadSynchronize ()
– Block CPU until all preceding cuda…() and kernel requests
have completed
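Putting steps 6 and 7 together, a minimal sketch using the darradd kernel and the execution configuration computed above:
darradd <<<blocks, threads_block>>> (da, 10.0f, N); // step 6: launch the kernel on the GPU
cudaThreadSynchronize (); // step 7: block the CPU until the kernel has completed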
8. Copy Data from GPU to CPU & 9. De-allocate Memory
float *da;
float *ha;
cudaMemcpy ((void *) ha, (void *) da, sizeof (float) * N, cudaMemcpyDeviceToHost);
cudaFree (da);
// display or process results here
free (ha);
The GPU Kernel
__global__ void darradd (float *da, float x, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) da[i] = da[i] + x;
}
[Figure: mapping of threads to array elements, assuming blockDim.x = 64. Within each block threadIdx.x runs from 0 to 63, so i = blockIdx.x * blockDim.x + threadIdx.x covers i = 0 … 63 in block 0, i = 64 … 127 in block 1, and so on (… 128 … 255, 256 …).]
Generic Unique Thread and Block Index Calculations #1
• 1D Grid / 1D Blocks:
UniqueBlockIndex = blockIdx.x;
UniqueThreadIndex = blockIdx.x * blockDim.x +
threadIdx.x;
• 1D Grid / 2D Blocks:
UniqueBlockIndex = blockIdx.x;
UniqueThreadIndex = blockIdx.x * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
• 1D Grid / 3D Blocks:
UniqueBlockIndex = blockIdx.x;
UniqueThreadIndex = blockIdx.x * blockDim.x * blockDim.y
* blockDim.z + threadIdx.z * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
• Source: http://forums.nvidia.com/lofiversion/index.php?t82040.html
Generic Unique Thread and Block Index Calculations #2
• 2D Grid / 1D Blocks:
UniqueBlockIndex = blockIdx.y * gridDim.x + blockIdx.x;
UniqueThreadIndex = UniqueBlockIndex * blockDim.x + threadIdx.x;
• 2D Grid / 2D Blocks:
UniqueBlockIndex = blockIdx.y * gridDim.x + blockIdx.x;
UniqueThreadIndex = UniqueBlockIndex * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
• 2D Grid / 3D Blocks:
UniqueBlockIndex = blockIdx.y * gridDim.x + blockIdx.x;
UniqueThreadIndex = UniqueBlockIndex * blockDim.z * blockDim.y *
blockDim.x + threadIdx.z * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
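A minimal sketch of a kernel that uses the 2D grid / 2D blocks calculation to index a flat array (the kernel name and array are illustrative):
__global__ void akernel2d (float *da, float x, int N)
{
int UniqueBlockIndex = blockIdx.y * gridDim.x + blockIdx.x;
int UniqueThreadIndex = UniqueBlockIndex * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
if (UniqueThreadIndex < N) da[UniqueThreadIndex] += x;
}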
• uint3 blockIdx
– Block ID, in 2D (blockIdx.z = 1 always)
• dim3 blockDim
– Number of threads per block, in 3D
• uint3 threadIdx
– Thread ID in block, in 3D
Execution Configuration Examples
• 1D grid / 1D blocks
dim3 gd(1024);
dim3 bd(64);
akernel<<<gd, bd>>>(...);
gridDim.x = 1024, gridDim.y = 1,
blockDim.x = 64, blockDim.y = 1, blockDim.z = 1
• 2D grid / 3D blocks
dim3 gd(4, 128);
dim3 bd(64, 16, 4);
akernel<<<gd, bd>>>(...);
gridDim.x = 4, gridDim.y = 128,
blockDim.x = 64, blockDim.y = 16, blockDim.z = 4
Error Handling
• Most cuda…() functions return a cudaError_t
– If cudaSuccess: Request completed without a problem
• cudaGetLastError():
– returns the last error to the CPU
– Use with cudaThreadSynchronize():
cudaError_t code;
cudaThreadSynchronize ();
code = cudaGetLastError ();
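A minimal usage sketch (the kernel launch is illustrative); cudaGetErrorString() converts the error code into a readable message:
cudaError_t code;
darradd <<<blocks, threads_block>>> (da, 10.0f, N);
cudaThreadSynchronize ();
code = cudaGetLastError ();
if (code != cudaSuccess)
printf ("CUDA error: %s\n", cudaGetErrorString (code));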
cutCreateTimer (&htimer);
cudaThreadSynchronize ();
cutStartTimer(htimer);
WHAT WE ARE INTERESTED IN
cudaThreadSynchronize ();
cutStopTimer(htimer);
printf ("time: %f\n", cutGetTimerValue(htimer));
Code Overview: Host side
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cutil.h>
unsigned int htimer;
float *ha, *da;
int main (int argc, char *argv[]) {
int N = atoi (argv[1]);
ha = (float *) malloc (sizeof (float) * N);
for (int i = 0; i < N; i++) ha[i] = i;
cutCreateTimer (&htimer);
cudaMalloc ((void **) &da, sizeof (float) * N);
cudaMemcpy ((void *) da, (void *) ha, sizeof (float) * N,
cudaMemcpyHostToDevice);
cudaThreadSynchronize ();
cutStartTimer(htimer);
int threads_block = 64; // threads per block, as assumed earlier
int blocks = (N + threads_block - 1) / threads_block;
darradd <<<blocks, threads_block>>> (da, 10.0f, N);
cudaThreadSynchronize ();
cutStopTimer(htimer);
cudaMemcpy ((void *) ha, (void *) da, sizeof (float) * N,
cudaMemcpyDeviceToHost);
cudaFree (da);
free (ha);
printf ("processing time: %f\n", cutGetTimerValue(htimer));
return 0;
}
Code Overview: Device Side
__device__ float addmany (float a, float b, int count)
{
while (count--) a += b;
return a;
}
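A __device__ function can only be called from code running on the GPU. A minimal sketch of a kernel that uses addmany (the kernel name and count parameter are illustrative):
__global__ void darradd_many (float *da, float x, int count, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) da[i] = addmany (da[i], x, count); // adds x to da[i] count times on the GPU
}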
• __device__
– stored in device memory (large, high latency, no cache)
– Allocated with cudaMalloc (__device__ qualifier implied)
– accessible by all threads
– lifetime: application
• __constant__
– same as __device__, but cached and read-only by GPU
– written by CPU via cudaMemcpyToSymbol(...) call
– lifetime: application
• __shared__
– stored in on-chip shared memory (very low latency)
– accessible by all threads in the same thread block
– lifetime: kernel launch
• Unqualified variables:
– scalars and built-in vector types are stored in registers
– arrays with more than 4 elements, or arrays accessed with run-time indices, are stored in device memory
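A minimal sketch putting these qualifiers together (the variable and kernel names are illustrative; blockDim.x is assumed to be 64 as in the earlier examples):
__constant__ float cfactor; // written by the CPU, read-only and cached on the GPU

__global__ void scale (float *da, int N)
{
__shared__ float tile[64]; // on-chip, visible to all threads of this block
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
tile[threadIdx.x] = da[i]; // staged here only to illustrate __shared__
da[i] = tile[threadIdx.x] * cfactor;
}
}
// host side, before the kernel launch (hfactor is a host float):
// cudaMemcpyToSymbol (cfactor, &hfactor, sizeof (float));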
Measurement Methodology
• You will not get exactly the same time
measurements every time
– Other processes running / external events (e.g., network
activity)
– Cannot control
– “Non-determinism”
• Must take sufficient samples
– say 10 or more
– There is theory on what the number of samples must be
• Measure average
• Will discuss this next time or will provide a handout
online
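A minimal sketch of taking several samples and averaging, using the cutil timer from the earlier slides (the sample count of 10 is illustrative):
float total = 0.0f;
for (int s = 0; s < 10; s++) { // say 10 or more samples
cutResetTimer (htimer);
cudaThreadSynchronize ();
cutStartTimer (htimer);
darradd <<<blocks, threads_block>>> (da, 10.0f, N);
cudaThreadSynchronize ();
cutStopTimer (htimer);
total += cutGetTimerValue (htimer);
}
printf ("average time: %f\n", total / 10);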
Handling Large Input Data Sets – 1D Example
• Recall gridDim.[xy] <= 65535
• Host calls kernel multiple times:
float *dac = da; // starting offset for current kernel
while (n_blocks)
{
int bn = n_blocks;
int elems; // array elements processed in this kernel
if (bn > 65535) bn = 65535;
elems = bn * block_size;
darradd <<<bn, block_size>>> (dac, 10.0f, elems);
n_blocks -= bn;
dac += elems;
}
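For context, n_blocks and block_size are assumed to have been set up before the loop along the lines of the earlier execution-configuration slide; a minimal sketch:
int block_size = 64; // threads per block (illustrative value)
int n_blocks = (N + block_size - 1) / block_size; // enough blocks to cover all N elements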