GPU Computing With CUDA Lecture 3 - Efficient Shared Memory Use
Christopher Cooper
Boston University
August, 2011
UTFSM, Valparaíso, Chile
Outline of lecture
Recap of Lecture 2
Shared memory in detail
Tiling
Bank conflicts
Recap
Thread hierarchy
- Threads are grouped into thread blocks
- Threads of the same block are executed on the same SM at the same time
Threads of a block can communicate through shared memory
An SM can have up to 8 blocks resident at the same time
Recap
Memory hierarchy
Smart use of the memory hierarchy!
Recap
Programming model: Finite Difference case
- One node per thread
- Node indexing automatically groups into thread blocks!
Shared Memory
Small (48kB per SM)
Fast (~4 cycles): On-chip
Private to each block
- Allows thread communication
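As a syntax reminder (a minimal sketch, not from the slides; kernel names are illustrative): a shared array can be declared with a compile-time size, or sized at launch time with extern __shared__.

__global__ void kernelStatic(float *out)
{
    __shared__ float buf[256];               // size fixed at compile time (256 threads per block assumed)
    buf[threadIdx.x] = (float)threadIdx.x;
    __syncthreads();
    out[threadIdx.x] = buf[255 - threadIdx.x];   // any thread of the block may read any element
}

__global__ void kernelDynamic(float *out)
{
    extern __shared__ float buf[];           // size given by the 3rd launch parameter, e.g.
    buf[threadIdx.x] = (float)threadIdx.x;   //   kernelDynamic<<<grid, 256, 256*sizeof(float)>>>(out);
    __syncthreads();
    out[threadIdx.x] = buf[255 - threadIdx.x];
}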
Example: 1D convection equation, discretized with a first-order upwind scheme:

    ∂u/∂t = -c ∂u/∂x

    u_i^(n+1) = u_i^n - (c·Δt/Δx)·(u_i^n - u_(i-1)^n)
__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c, int BLOCKSIZE)
{
    // Each thread will load one element
    int i = threadIdx.x + BLOCKSIZE * blockIdx.x;

    if (i >= N) { return; }

    u_prev[i] = u[i];

    if (i > 0)
    {
        u[i] = u_prev[i] - c*dt/dx * (u_prev[i] - u_prev[i-1]);
    }
}
Note the redundant global-memory traffic: thread i loads u_prev[i] and u_prev[i-1], while thread i+1 loads u_prev[i+1] and u_prev[i], so every element ends up being read twice from global memory.
Same update, now staging the data in shared memory (BLOCKSIZE is a compile-time constant here, since it sizes the shared array):

__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c)
{
    // Each thread will load one element
    int i = threadIdx.x;
    int I = threadIdx.x + BLOCKSIZE * blockIdx.x;

    __shared__ float u_shared[BLOCKSIZE];   // Allocate shared array

    if (I >= N) { return; }

    u_shared[i] = u[I];                     // Load to shared memory
    __syncthreads();

    if (I > 0)
    {
        u[I] = u_shared[i] - c*dt/dx * (u_shared[i] - u_shared[i-1]);
    }
}
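For context, a minimal host-side sketch of how such a kernel might be driven (the function run, the host array u_h, nsteps and the choice BLOCKSIZE = 256 are illustrative, not from the slides):

#include <cuda_runtime.h>

#define BLOCKSIZE 256    // compile-time constant: also sizes the __shared__ array inside the kernel

// Illustrative driver: u_h holds the initial condition on the host, nsteps time steps are taken
void run(float *u_h, int N, float dx, float dt, float c, int nsteps)
{
    size_t bytes = N * sizeof(float);
    float *u_d, *u_prev_d;
    cudaMalloc((void**)&u_d, bytes);
    cudaMalloc((void**)&u_prev_d, bytes);
    cudaMemcpy(u_d, u_h, bytes, cudaMemcpyHostToDevice);

    int blocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;     // enough blocks to cover all N nodes
    for (int step = 0; step < nsteps; step++)
        update<<<blocks, BLOCKSIZE>>>(u_d, u_prev_d, N, dx, dt, c);

    cudaMemcpy(u_h, u_d, bytes, cudaMemcpyDeviceToHost);
    cudaFree(u_d);
    cudaFree(u_prev_d);
}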
This version reads u_shared[i-1] out of bounds for the first thread of each block, so threads on the edges of a block fall back to global memory (u_prev):

if (I >= N) { return; }

u_prev[I]   = u[I];
u_shared[i] = u[I];
__syncthreads();

if (i > 0 && i < BLOCKSIZE-1)    // interior of the block: neighbor is in shared memory
{
    u[I] = u_shared[i] - c*dt/dx * (u_shared[i] - u_shared[i-1]);
}
else                             // first/last thread of the block: read the neighbor from global memory
{
    u[I] = u_prev[I] - c*dt/dx * (u_prev[I] - u_prev[I-1]);
}
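Putting the fragments together, a possible form of the full kernel (assembled here for readability, not verbatim from the slides; the I > 0 guard for the physical left boundary is carried over from the global-memory version):

__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c)
{
    int i = threadIdx.x;                            // index within the block
    int I = threadIdx.x + BLOCKSIZE * blockIdx.x;   // global node index

    __shared__ float u_shared[BLOCKSIZE];

    if (I >= N) { return; }

    u_prev[I]   = u[I];    // global copy, used by threads on the edges of neighboring blocks
    u_shared[i] = u[I];    // block-local copy in shared memory
    __syncthreads();

    if (I > 0)                                      // I = 0 is the physical left boundary: no update
    {
        if (i > 0 && i < BLOCKSIZE-1)               // interior of the block: neighbor is in shared memory
            u[I] = u_shared[i] - c*dt/dx * (u_shared[i] - u_shared[i-1]);
        else                                        // edge of the block: read the neighbor from global memory
            u[I] = u_prev[I] - c*dt/dx * (u_prev[I] - u_prev[I-1]);
    }
}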
2D heat equation:

    ∂u/∂t = α ∇²u

Explicit scheme:

    u_(i,j)^(n+1) = u_(i,j)^n + (α·Δt/h²)·(u_(i,j+1)^n + u_(i,j-1)^n + u_(i+1,j)^n + u_(i-1,j)^n - 4·u_(i,j)^n)
[Figure: square domain with boundary conditions T = 0 on two sides and T = 200 on the other two.]
[Figure: row-wise node numbering with a single index I increasing from 0; the first row ends at Nx-1 and the last row runs from Nx*(Ny-1) to Nx*Ny-1.]
__global__ void update(float *u, float *u_prev, int N, float h, float dt, float alpha, int BSZ)
{
    // Setting up indices
    int i = threadIdx.x;
    int j = threadIdx.y;
    int I = blockIdx.y*BSZ*N + blockIdx.x*BSZ + j*N + i;

    if (I >= N*N) { return; }

    u_prev[I] = u[I];

    // If not on a boundary, do the update
    if ((I > N) && (I < N*N-1-N) && (I%N != 0) && (I%N != N-1))
    {
        u[I] = u_prev[I] + alpha*dt/(h*h) * (u_prev[I+1] + u_prev[I-1] +
                                             u_prev[I+N] + u_prev[I-N] - 4*u_prev[I]);
    }
}
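A minimal sketch of the matching launch configuration (device pointers u_d, u_prev_d are illustrative; the indexing above assumes N is a multiple of BSZ):

dim3 block(BSZ, BSZ);          // BSZ x BSZ threads per block
dim3 grid(N/BSZ, N/BSZ);       // one block per BSZ x BSZ tile of the N x N grid
update<<<grid, block>>>(u_d, u_prev_d, N, h, dt, alpha, BSZ);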
Global memory
Advantage
- Easy to implement
Disadvantages
- Branching statement
- Still some redundant loads
Shared memory
Shared-memory version (fragment): nodes interior to the block read their neighbors from the shared tile; nodes on the block boundary fall back to global memory.

// If not on a block boundary, do the update from shared memory
if (block_check)
{
    u[I] = u_prev_sh[i][j] + alpha*dt/h/h * (u_prev_sh[i+1][j] + u_prev_sh[i-1][j] +
                                             u_prev_sh[i][j+1] + u_prev_sh[i][j-1] - 4*u_prev_sh[i][j]);
}
// If on a block boundary (but not a domain boundary), use global memory
else if (bound_check)
{
    u[I] = u_prev[I] + alpha*dt/(h*h) * (u_prev[I+1] + u_prev[I-1] + u_prev[I+N] +
                                         u_prev[I-N] - 4*u_prev[I]);
}
}
[Figure: a BSZ x BSZ tile; the block operates only on the (BSZ-2) x (BSZ-2) internal nodes.]
A simpler variant updates only the nodes that are interior to both the domain and the block:

if (I >= N || J >= N) { return; }

__shared__ float u_prev_sh[BSZ][BSZ];

u_prev_sh[i][j] = u[Index];
__syncthreads();

bool bound_check = ((I != 0) && (I < N-1) && (J != 0) && (J < N-1));
bool block_check = ((i != 0) && (i < BSZ-1) && (j != 0) && (j < BSZ-1));

if (bound_check && block_check)
{
    u[Index] = u_prev_sh[i][j] + alpha*dt/h/h * (u_prev_sh[i+1][j] + u_prev_sh[i-1][j] +
                                                 u_prev_sh[i][j+1] + u_prev_sh[i][j-1] - 4*u_prev_sh[i][j]);
}
}
[Figure: the shared tile is extended to (BSZ+2) x (BSZ+2) to hold a one-node halo around the BSZ x BSZ block; each of the BSZ x BSZ threads then operates on element [i+1][j+1] of the tile.]
Loading the halo: a block of 8x8 threads must fill a 10x10 shared tile (the tile plus its halo), so the 64 threads cannot load the 100 elements one to one. The thread index is flattened and remapped onto the (BSZ+2) x (BSZ+2) tile. A first pass loads the first BSZ*BSZ (here 64) elements; I_0 denotes the global index at which this block's halo-extended tile starts:

__shared__ float u_prev_sh[BSZ+2][BSZ+2];

int ii = j*BSZ + i,       // Flatten thread indexing
    I  = ii%(BSZ+2),      // x-direction index, including halo
    J  = ii/(BSZ+2);      // y-direction index, including halo

int I_n = I_0 + J*N + I;  // General (global) index
u_prev_sh[I][J] = u[I_n];

A second pass, with the flattened index offset past the elements already loaded (ii2, I2, J2 defined analogously to ii, I, J), loads the remaining elements; the guard keeps threads that map outside the tile or the array from loading:

int I_n2 = I_0 + J2*N + I2;  // General (global) index
if ((I2 < (BSZ+2)) && (J2 < (BSZ+2)) && (I_n2 < N*N))
    u_prev_sh[I2][J2] = u[I_n2];

After __syncthreads(), each thread updates its own node, which sits at [i+1][j+1] of the tile (bx, by denote blockIdx.x, blockIdx.y):

int Index = by*BSZ*N + bx*BSZ + (j+1)*N + i + 1;

u[Index] = u_prev_sh[i+1][j+1] + alpha*dt/h/h * (u_prev_sh[i+2][j+1] + u_prev_sh[i][j+1] +
           u_prev_sh[i+1][j+2] + u_prev_sh[i+1][j] - 4*u_prev_sh[i+1][j+1]);
SM Implementation
The technique described above is called tiling
- Tiling means loading data into shared memory in tiles
- Useful when shared memory is used as a cache
- Also used when the data is too large to fit in shared memory at once and has to be loaded in smaller chunks (see the sketch below)
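A bare-bones sketch of that pattern (illustrative only: the array name is made up and the per-tile computation is elided):

#define TILE 256   // tile size = number of threads per block (compile-time constant)

// Skeleton of the tiling pattern: stage one tile of `in` at a time in shared memory,
// synchronize, work on it, and synchronize again before the next tile overwrites it.
__global__ void tiled(const float *in, int n)
{
    __shared__ float tile[TILE];

    int numTiles = (n + TILE - 1) / TILE;
    for (int t = 0; t < numTiles; t++)
    {
        int idx = t * TILE + threadIdx.x;
        if (idx < n)
            tile[threadIdx.x] = in[idx];   // cooperative load of tile t
        __syncthreads();                   // whole tile is in shared memory before anyone reads it

        // ... compute here: any thread of the block may now read any element of tile[] ...

        __syncthreads();                   // everyone is done with tile[] before it is overwritten
    }
}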
Bank conflicts
On 2.x hardware there is no bank conflict if the memory request is for the same 32-bit word (it is broadcast to all requesting threads). This does not hold on 1.x hardware.
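A classic case is column-wise access to a 2D shared array; padding the leading dimension by one element removes the conflict. A small sketch (hypothetical kernel, assuming a Fermi-class card with 32 banks and a single 32x32 block):

#define TILE 32

// Transpose one TILE x TILE matrix with a single block of TILE x TILE threads
__global__ void transposeTile(const float *in, float *out)
{
    // With a [TILE][TILE] array, the elements of one column are all 32 words apart,
    // i.e. in the same bank, so a column-wise read is a 32-way bank conflict.
    // The +1 padding shifts each row by one bank and makes the column read conflict-free.
    __shared__ float tile[TILE][TILE + 1];

    int x = threadIdx.x;
    int y = threadIdx.y;

    tile[y][x] = in[y * TILE + x];     // row-wise (coalesced) load from global memory
    __syncthreads();
    out[y * TILE + x] = tile[x][y];    // column-wise read from shared memory: no conflict thanks to padding
}

// Launch (illustrative): transposeTile<<<1, dim3(TILE, TILE)>>>(in_d, out_d);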
Shared Memory
__syncthreads()
- Barrier that waits for all threads of the block before continuing
- Need to make sure all data is loaded to shared before access
- Avoids race conditions
- Serializes the code: don't overuse it!

u_shared[i] = u[I];
__syncthreads();

if (i > 0 && i < BLOCKSIZE-1)
    u[I] = u_shared[i] - c*dt/dx * (u_shared[i] - u_shared[i-1]);
Race condition
When two or more threads access and operate on a memory location without synchronization
Example: we have the value 3 stored in global memory and two threads want to add one to that value.
- Possibility 1:
  Thread 1 reads the value 3, adds 1, and writes 4 back to memory
  Thread 2 reads the value 4, adds 1, and writes 5 back to memory
- Possibility 2:
  Thread 1 reads the value 3
  Thread 2 reads the value 3
  Both threads operate on 3 and write back the value 4 to memory
Solutions:
- __syncthreads() or atomic operations
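As a concrete sketch (hypothetical kernels, not from the slides): with a plain read-modify-write the final value is unpredictable, while atomicAdd makes the increment indivisible.

// Every thread adds 1 to the same counter
__global__ void countRacy(int *counter)
{
    *counter = *counter + 1;      // read-modify-write: several threads may read the same old value
}

__global__ void countAtomic(int *counter)
{
    atomicAdd(counter, 1);        // read, add and write happen as one indivisible operation
}

// Launched with, e.g., 256 threads and *counter initialized to 0:
//   countRacy<<<1, 256>>>(counter_d);    // result is unpredictable (often much less than 256)
//   countAtomic<<<1, 256>>>(counter_d);  // result is exactly 256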
Atomic operations
Atomic operations deal with race conditions
- They guarantee that, while the operation is being executed, that location in memory is not accessed by other threads
- Still, we can't rely on any ordering of thread execution!
- Types
atomicAdd
atomicSub
atomicExch
atomicMin
atomicMax
etc...
Atomic operations
__global__ void update(int *values, int *who)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int I = who[i];
    atomicAdd(&values[I], 1);   // safely increment values[I], even if several threads hit the same I
}
Atomic operations
Useful if you have a sparse access pattern
Atomic operations are slower than their non-atomic counterparts
They can serialize your execution if many threads want to access the same memory location
- Think about parallelizing your data, not only execution
- Use a hierarchy of atomic operations to avoid this (see the sketch below)
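A sketch of that hierarchy (hypothetical histogram kernel, not from the slides; shared-memory atomics need compute capability 1.2 or higher): each block first accumulates into shared-memory bins, then pushes one atomic update per bin to global memory.

// Count how many entries of bin_of[] fall into each of nbins bins
__global__ void histogram(const int *bin_of, int *global_bins, int n, int nbins)
{
    extern __shared__ int local_bins[];                  // nbins ints, sized at launch time

    for (int b = threadIdx.x; b < nbins; b += blockDim.x)
        local_bins[b] = 0;                               // clear the block-local bins
    __syncthreads();

    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        atomicAdd(&local_bins[bin_of[i]], 1);            // contention stays inside the block (fast shared memory)
    __syncthreads();

    for (int b = threadIdx.x; b < nbins; b += blockDim.x)
        if (local_bins[b] > 0)
            atomicAdd(&global_bins[b], local_bins[b]);   // one global atomic per non-empty bin per block
}

It would be launched with the dynamic shared-memory size as the third launch parameter, e.g. histogram<<<blocks, threads, nbins*sizeof(int)>>>(bin_of_d, global_bins_d, n, nbins).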