GPU Computing with CUDA
Lecture 3 - Efficient Shared Memory Use
Christopher Cooper
Boston University
August, 2011
UTFSM, Valparaíso, Chile
1
Outline of lecture
Recap of Lecture 2
Shared memory in detail
Tiling
Bank conflicts
Recap
Thread hierarchy
- Threads are grouped in thread blocks
- Threads of the same block are executed on the same SM at the same
time
Threads can communicate with shared memory
An SM can have up to 8 blocks at the same time
- Thread blocks are divided sequentially into warps of 32 threads each
- Threads of the same warp are scheduled together
- SM implements a zero-overhead warp scheduling
3
Recap
Memory hierarchy
Smart use of
memory hierarchy!
4
Recap
Programming model: Finite Difference case
- One node per thread
- Node indexing automatically groups into thread blocks!
Recap
Programming model: Finite Difference case
- One node per thread
- Node indexing automatically groups into thread blocks!
Thread = node
Thread block
Shared Memory
Small (48kB per SM)
Fast (~4 cycles): On-chip
Private to each block
- Allows thread communication
How can we use it?
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c, int BLOCKSIZE)
{
    // Each thread will load one element
    int i = threadIdx.x + BLOCKSIZE * blockIdx.x;
    if (i >= N) { return; }

    u_prev[i] = u[i];

    if (i > 0)
    {
        u[i] = u_prev[i] - c*dt/dx*(u_prev[i] - u_prev[i-1]);
    }
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc,int
BLOCKSIZE)
{
//Eachthreadwillloadoneelement
Thread i
inti=threadIdx.x+BLOCKSIZE*blockIdx.x;
if(i>=N){return;}
u_prev[i]=u[i];
if(i>0)
{u[i]=u_prev[i]c*dt/dx*(u_prev[i]u_prev[i1]);
}
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc,int
BLOCKSIZE)
{
//Eachthreadwillloadoneelement
Thread i
inti=threadIdx.x+BLOCKSIZE*blockIdx.x;
if(i>=N){return;}
u_prev[i]=u[i];
Loads
element i
if(i>0)
{u[i]=u_prev[i]c*dt/dx*(u_prev[i]u_prev[i1]);
}
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc,int
BLOCKSIZE)
{
//Eachthreadwillloadoneelement
Thread i
inti=threadIdx.x+BLOCKSIZE*blockIdx.x;
if(i>=N){return;}
u_prev[i]=u[i];
Loads
element i
Loads
element i-1
if(i>0)
{u[i]=u_prev[i]c*dt/dx*(u_prev[i]u_prev[i1]);
}
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c, int BLOCKSIZE)
{
    // Each thread will load one element
    int i = threadIdx.x + BLOCKSIZE * blockIdx.x;
    if (i >= N) { return; }

    u_prev[i] = u[i];

    if (i > 0)
    {
        u[i] = u_prev[i] - c*dt/dx*(u_prev[i] - u_prev[i-1]);
    }
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc,int
BLOCKSIZE)
{
//Eachthreadwillloadoneelement
Thread i +1
inti=threadIdx.x+BLOCKSIZE*blockIdx.x;
if(i>=N){return;}
u_prev[i]=u[i];
if(i>0)
{u[i]=u_prev[i]c*dt/dx*(u_prev[i]u_prev[i1]);
}
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc,int
BLOCKSIZE)
{
//Eachthreadwillloadoneelement
Thread i +1
inti=threadIdx.x+BLOCKSIZE*blockIdx.x;
if(i>=N){return;}
u_prev[i]=u[i];
Loads
element i+1
if(i>0)
{u[i]=u_prev[i]c*dt/dx*(u_prev[i]u_prev[i1]);
}
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc,int
BLOCKSIZE)
{
//Eachthreadwillloadoneelement
Thread i +1
inti=threadIdx.x+BLOCKSIZE*blockIdx.x;
if(i>=N){return;}
u_prev[i]=u[i];
Loads
element i+1
Loads
element i
if(i>0)
{u[i]=u_prev[i]c*dt/dx*(u_prev[i]u_prev[i1]);
}
}
Shared Memory - Making use of it
Looking at a 1D FDM example (similar to lab)
$$\frac{\partial u}{\partial t} = -c\,\frac{\partial u}{\partial x}$$

$$u_i^{n+1} = u_i^n - \frac{c\,\Delta t}{\Delta x}\left(u_i^n - u_{i-1}^n\right)$$
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc,int
BLOCKSIZE)
{
//Eachthreadwillloadoneelement
Thread i +1
inti=threadIdx.x+BLOCKSIZE*blockIdx.x;
if(i>=N){return;}
u_prev[i]=u[i];
Loads
element i+1
Loads
element i
if(i>0)
{u[i]=u_prev[i]c*dt/dx*(u_prev[i]u_prev[i1]);
}
}
Order N redundant loads!
Shared Memory - Making use of it
Idea: We could load only once to shared memory, and operate there
__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c)
{
    // Each thread will load one element
    int i = threadIdx.x;
    int I = threadIdx.x + BLOCKSIZE * blockIdx.x;
    __shared__ float u_shared[BLOCKSIZE];

    if (I >= N) { return; }
    u_shared[i] = u[I];
    __syncthreads();

    if (I > 0)
    {
        u[I] = u_shared[i] - c*dt/dx*(u_shared[i] - u_shared[i-1]);
    }
}
Shared Memory - Making use of it
Idea: We could load only once to shared memory, and operate there
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc)
{
//Eachthreadwillloadoneelement
inti=threadIdx.x;
intI=threadIdx.x+BLOCKSIZE*blockIdx.x;
__shared__floatu_shared[BLOCKSIZE];
Allocate shared array
if(I>=N){return;}
u_shared[i]=u[I];
__syncthreads();
if(I>0)
{u[I]=u_shared[i]c*dt/dx*(u_shared[i]u_shared[i1]);}
}
Shared Memory - Making use of it
Idea: We could load only once to shared memory, and operate there
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc)
{
//Eachthreadwillloadoneelement
inti=threadIdx.x;
intI=threadIdx.x+BLOCKSIZE*blockIdx.x;
__shared__floatu_shared[BLOCKSIZE];
Allocate shared array
if(I>=N){return;}
u_shared[i]=u[I];
Load to shared mem
__syncthreads();
if(I>0)
{u[I]=u_shared[i]c*dt/dx*(u_shared[i]u_shared[i1]);}
}
Shared Memory - Making use of it
Idea: We could load only once to shared memory, and operate there
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc)
{
//Eachthreadwillloadoneelement
inti=threadIdx.x;
intI=threadIdx.x+BLOCKSIZE*blockIdx.x;
__shared__floatu_shared[BLOCKSIZE];
Allocate shared array
if(I>=N){return;}
u_shared[i]=u[I];
Load to shared mem
__syncthreads();
if(I>0)
{u[I]=u_shared[i]c*dt/dx*(u_shared[i]u_shared[i1]);}
}
Fetch shared mem
Shared Memory - Making use of it
Idea: We could load only once to shared memory, and operate there
__global__voidupdate(float*u,float*u_prev,intN,floatdx,floatdt,floatc)
{
//Eachthreadwillloadoneelement
inti=threadIdx.x;
intI=threadIdx.x+BLOCKSIZE*blockIdx.x;
__shared__floatu_shared[BLOCKSIZE];
Allocate shared array
if(I>=N){return;}
u_shared[i]=u[I];
Load to shared mem
__syncthreads();
if(I>0)
{u[I]=u_shared[i]c*dt/dx*(u_shared[i]u_shared[i1]);}
}
Fetch shared mem
Works if N <= Block size... What if not?
8
Shared Memory - Making use of it
__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c)
{
    // Each thread will load one element
    int i = threadIdx.x;
    int I = threadIdx.x + BLOCKSIZE * blockIdx.x;
    __shared__ float u_shared[BLOCKSIZE];

    if (I >= N) { return; }
    u_prev[I] = u[I];
    u_shared[i] = u[I];
    __syncthreads();

    if (i > 0 && i < BLOCKSIZE-1)
    {
        u[I] = u_shared[i] - c*dt/dx*(u_shared[i] - u_shared[i-1]);
    }
    else
    {
        u[I] = u_prev[I] - c*dt/dx*(u_prev[I] - u_prev[I-1]);
    }
}
Shared Memory - Making use of it
__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c)
{
    // Each thread will load one element
    int i = threadIdx.x;
    int I = threadIdx.x + BLOCKSIZE * blockIdx.x;
    __shared__ float u_shared[BLOCKSIZE];

    if (I >= N) { return; }
    u_prev[I] = u[I];
    u_shared[i] = u[I];
    __syncthreads();

    if (i > 0 && i < BLOCKSIZE-1)
    {
        u[I] = u_shared[i] - c*dt/dx*(u_shared[i] - u_shared[i-1]);
    }
    else
    {
        u[I] = u_prev[I] - c*dt/dx*(u_prev[I] - u_prev[I-1]);
    }
}
Shared Memory - Making use of it
__global__ void update(float *u, float *u_prev, int N, float dx, float dt, float c)
{
    // Each thread will load one element
    int i = threadIdx.x;
    int I = threadIdx.x + BLOCKSIZE * blockIdx.x;
    __shared__ float u_shared[BLOCKSIZE];

    if (I >= N) { return; }
    u_prev[I] = u[I];
    u_shared[i] = u[I];
    __syncthreads();

    if (i > 0 && i < BLOCKSIZE-1)
    {
        u[I] = u_shared[i] - c*dt/dx*(u_shared[i] - u_shared[i-1]);
    }
    else
    {
        u[I] = u_prev[I] - c*dt/dx*(u_prev[I] - u_prev[I-1]);
    }
}
Reduced loads from 2*N to N+2*N/BLOCKSIZE
9
Using shared memory as cache
Looking at the 2D heat diffusion problem from lab 2
$$\frac{\partial u}{\partial t} = \alpha\,\nabla^2 u$$

Explicit scheme:

$$u_{i,j}^{n+1} = u_{i,j}^n + \frac{\alpha\,\Delta t}{h^2}\left(u_{i,j+1}^n + u_{i,j-1}^n + u_{i+1,j}^n + u_{i-1,j}^n - 4u_{i,j}^n\right)$$
T=0
T = 200
T=0
T = 200
10
Shared Memory Implementation - Mapping Problem
Using row major flattened array
int i = threadIdx.x;
int j = threadIdx.y;
int I = blockIdx.y*BSZ*N + blockIdx.x*BSZ + j*N + i;
Nx*(Ny-1)
I increasing
Nx*Ny-1
I increasing
Nx-1
11
Shared Memory Implementation - Global Memory
This implementation has redundant loads to global memory
slow
__global__ void update(float *u, float *u_prev, int N, float h, float dt, float alpha, int BSZ)
{
    // Setting up indices
    int i = threadIdx.x;
    int j = threadIdx.y;
    int I = blockIdx.y*BSZ*N + blockIdx.x*BSZ + j*N + i;
    if (I >= N*N) { return; }

    u_prev[I] = u[I];

    // if not boundary do
    if ( (I > N) && (I < N*N-1-N) && (I%N != 0) && (I%N != N-1) )
    {
        u[I] = u_prev[I] + alpha*dt/(h*h) * (u_prev[I+1] + u_prev[I-1]
             + u_prev[I+N] + u_prev[I-N] - 4*u_prev[I]);
    }
}
12
Shared Memory Implementation - Solution 1
Recast solution given earlier
- Load to shared memory
- Use shared memory if not on boundary of a block
Global memory
- Use global memory otherwise
Advantage
- Easy to implement
Disadvantage
- Branching statement
- Still have some redundant loads
Shared memory
13
Shared Memory Implementation - Solution 1
__global__ void update(float *u, float *u_prev, int N, float h, float dt, float alpha)
{
    // Setting up indices
    int i = threadIdx.x;
    int j = threadIdx.y;
    int I = blockIdx.y*BSZ*N + blockIdx.x*BSZ + j*N + i;
    if (I >= N*N) { return; }

    __shared__ float u_prev_sh[BSZ][BSZ];
    u_prev_sh[i][j] = u[I];
    u_prev[I] = u[I];
    __syncthreads();

    bool bound_check = ((I > N) && (I < N*N-1-N) && (I%N != 0) && (I%N != N-1));
    bool block_check = ((i > 0) && (i < BSZ-1) && (j > 0) && (j < BSZ-1));

    // if not on block boundary do
    if (block_check)
    {
        u[I] = u_prev_sh[i][j] + alpha*dt/h/h * (u_prev_sh[i+1][j] + u_prev_sh[i-1][j]
             + u_prev_sh[i][j+1] + u_prev_sh[i][j-1] - 4*u_prev_sh[i][j]);
    }
    // if not on boundary
    else if (bound_check)
    {
        u[I] = u_prev[I] + alpha*dt/(h*h) * (u_prev[I+1] + u_prev[I-1] + u_prev[I+N]
             + u_prev[I-N] - 4*u_prev[I]);
    }
14
}
Shared Memory Implementation - Solution 2
We want to avoid the reads from global memory
- Let's use halo nodes to compute block edges
Images: Mark Giles, Oxford, UK
15
Shared Memory Implementation - Solution 2
Change indexing so as to
jump in steps of BSZ-2
instead of BSZ
BSZ
BSZ -2
Operate on internal
nodes
BSZ
Load data to shared
memory
BSZ -2
We'll need Nx/(BSZ-2)
blocks per dimension,
instead of Nx/BSZ
16
Shared Memory Implementation - Solution 2
__global__ void update(float *u, float *u_prev, int N, float h, float dt, float alpha)
{
    // Setting up indices
    int i = threadIdx.x, j = threadIdx.y, bx = blockIdx.x, by = blockIdx.y;
    int I = (BSZ-2)*bx + i, J = (BSZ-2)*by + j;
    int Index = I + J*N;
    if (I >= N || J >= N) { return; }

    __shared__ float u_prev_sh[BSZ][BSZ];
    u_prev_sh[i][j] = u[Index];
    __syncthreads();

    bool bound_check = ((I != 0) && (I < N-1) && (J != 0) && (J < N-1));
    bool block_check = ((i != 0) && (i < BSZ-1) && (j != 0) && (j < BSZ-1));

    if (bound_check && block_check)
    {
        u[Index] = u_prev_sh[i][j] + alpha*dt/h/h * (u_prev_sh[i+1][j]
                 + u_prev_sh[i-1][j] + u_prev_sh[i][j+1] + u_prev_sh[i][j-1] - 4*u_prev_sh[i][j]);
    }
}
17
Shared Memory Implementation - Solution 2
We've eliminated all redundant global memory accesses!
But...
- There's still a heavy amount of branching
GPUs are not great at branching... we'll look into that later today
- All threads read, but only some operate
We're underutilizing the device!
If we have 16x16 = 256 threads, all read, but only 14x14 = 196
operate, and we're using only ~75% of the device. In 3D this number
drops to ~40%!
18
Shared Memory Implementation - Solution 3
We need to go further...
BSZ+2
BSZ
- Load in two stages
BSZ+2
- To not underutilize the
device, we need to
load more data than
threads
BSZ
- Operate on [i+1][j+1]
threads
19
Shared Memory Implementation - Solution 3
Loading in 2 steps
- Use the 64 available threads to load the
64 first values to shared
__shared__ float u_prev_sh[BSZ+2][BSZ+2];
int ii = j*BSZ + i,    // Flatten thread indexing
    I  = ii%(BSZ+2),   // x-direction index, including halo
    J  = ii/(BSZ+2);   // y-direction index, including halo
int I_n = I_0 + J*N + I;   // General index
u_prev_sh[I][J] = u[I_n];
- Load the remaining values
int ii2 = BSZ*BSZ + j*BSZ + i;
int I2 = ii2%(BSZ+2);
int J2 = ii2/(BSZ+2);
8x8 threads
10x10 loads
int I_n2 = I_0 + J2*N + I2;   // General index
if ((I2 < (BSZ+2)) && (J2 < (BSZ+2)) && (ii2 < N*N))
    u_prev_sh[I2][J2] = u[I_n2];
20
Shared Memory Implementation - Solution 3
Loading in 2 steps
- Use the 64 available threads to load the
64 first values to shared
__shared__floatu_prev_sh[BSZ+2][BSZ+2];
intii=j*BSZ+i,//Flattenthreadindexing
I=ii%(BSZ+2),//xdirectionindexincludinghalo
J=ii/(BSZ+2);//ydirectionindexincludinghalo
intI_n=I_0+J*N+I;//Generalindex
u_prev_sh[I][J]=u[I_n];
- Load the remaining values
intii2=BSZ*BSZ+j*BSZ+i;
intI2=ii2%(BSZ+2);
intJ2=ii2/(BSZ+2);
I0
8x8 threads
10x10 loads
intI_n2=I_0+J2*N+I2;//Generalindex
if((I2<(BSZ+2))&&(J2<(BSZ+2))&&(ii2<N*N))
u_prev_sh[I2][J2]=u[I_n2];
20
Shared Memory Implementation - Solution 3
Loading in 2 steps
- Use the 64 available threads to load the
64 first values to shared
64th
__shared__floatu_prev_sh[BSZ+2][BSZ+2];
intii=j*BSZ+i,//Flattenthreadindexing
I=ii%(BSZ+2),//xdirectionindexincludinghalo
J=ii/(BSZ+2);//ydirectionindexincludinghalo
intI_n=I_0+J*N+I;//Generalindex
u_prev_sh[I][J]=u[I_n];
- Load the remaining values
intii2=BSZ*BSZ+j*BSZ+i;
intI2=ii2%(BSZ+2);
intJ2=ii2/(BSZ+2);
I0
8x8 threads
10x10 loads
intI_n2=I_0+J2*N+I2;//Generalindex
if((I2<(BSZ+2))&&(J2<(BSZ+2))&&(ii2<N*N))
u_prev_sh[I2][J2]=u[I_n2];
20
Shared Memory Implementation - Solution 3
Loading in 2 steps
- Use the 64 available threads to load the
64 first values to shared
64th
__shared__floatu_prev_sh[BSZ+2][BSZ+2];
intii=j*BSZ+i,//Flattenthreadindexing
I=ii%(BSZ+2),//xdirectionindexincludinghalo
J=ii/(BSZ+2);//ydirectionindexincludinghalo
intI_n=I_0+J*N+I;//Generalindex
u_prev_sh[I][J]=u[I_n];
- Load the remaining values
intii2=BSZ*BSZ+j*BSZ+i;
intI2=ii2%(BSZ+2);
intJ2=ii2/(BSZ+2);
I0
8x8 threads
10x10 loads
intI_n2=I_0+J2*N+I2;//Generalindex
if((I2<(BSZ+2))&&(J2<(BSZ+2))&&(ii2<N*N))
u_prev_sh[I2][J2]=u[I_n2];
20
Shared Memory Implementation - Solution 3
Loading in 2 steps
- Use the 64 available threads to load the
64 first values to shared
__shared__floatu_prev_sh[BSZ+2][BSZ+2];
intii=j*BSZ+i,//Flattenthreadindexing
I=ii%(BSZ+2),//xdirectionindexincludinghalo
J=ii/(BSZ+2);//ydirectionindexincludinghalo
intI_n=I_0+J*N+I;//Generalindex
u_prev_sh[I][J]=u[I_n];
- Load the remaining values
intii2=BSZ*BSZ+j*BSZ+i;
intI2=ii2%(BSZ+2);
intJ2=ii2/(BSZ+2);
I0
8x8 threads
10x10 loads
intI_n2=I_0+J2*N+I2;//Generalindex
if((I2<(BSZ+2))&&(J2<(BSZ+2))&&(ii2<N*N))
u_prev_sh[I2][J2]=u[I_n2];
20
Shared Memory Implementation - Solution 3
Loading in 2 steps
- Use the 64 available threads to load the
64 first values to shared
__shared__floatu_prev_sh[BSZ+2][BSZ+2];
intii=j*BSZ+i,//Flattenthreadindexing
I=ii%(BSZ+2),//xdirectionindexincludinghalo
J=ii/(BSZ+2);//ydirectionindexincludinghalo
intI_n=I_0+J*N+I;//Generalindex
u_prev_sh[I][J]=u[I_n];
- Load the remaining values
intii2=BSZ*BSZ+j*BSZ+i;
intI2=ii2%(BSZ+2);
intJ2=ii2/(BSZ+2);
I0
8x8 threads
10x10 loads
intI_n2=I_0+J2*N+I2;//Generalindex
if((I2<(BSZ+2))&&(J2<(BSZ+2))&&(ii2<N*N))
u_prev_sh[I2][J2]=u[I_n2];
20
Shared Memory Implementation - Solution 3
Loading in 2 steps
- Use the 64 available threads to load the
64 first values to shared
__shared__floatu_prev_sh[BSZ+2][BSZ+2];
intii=j*BSZ+i,//Flattenthreadindexing
I=ii%(BSZ+2),//xdirectionindexincludinghalo
J=ii/(BSZ+2);//ydirectionindexincludinghalo
intI_n=I_0+J*N+I;//Generalindex
u_prev_sh[I][J]=u[I_n];
- Load the remaining values
intii2=BSZ*BSZ+j*BSZ+i;
intI2=ii2%(BSZ+2);
intJ2=ii2/(BSZ+2);
I0
8x8 threads
10x10 loads
intI_n2=I_0+J2*N+I2;//Generalindex
if((I2<(BSZ+2))&&(J2<(BSZ+2))&&(ii2<N*N))
u_prev_sh[I2][J2]=u[I_n2];
Some threads won't load
20
Shared Memory Implementation - Solution 3
Compute on interior points: threads [i+1][j+1]
Index
int Index = by*BSZ*N + bx*BSZ + (j+1)*N + i + 1;
u[Index] = u_prev_sh[i+1][j+1] + alpha*dt/h/h * (u_prev_sh[i+2][j+1] + u_prev_sh[i][j+1]
         + u_prev_sh[i+1][j+2] + u_prev_sh[i+1][j] - 4*u_prev_sh[i+1][j+1]);
21
SM Implementation
The technique described is called tiling
- Tiling means loading data to shared memory in tiles
- Useful when shared memory is used as cache
- Also used when all data is too large to fit in shared memory and you
load it in smaller chunks
We will implement this in tomorrow's lab!
22
Shared Memory - Bank conflicts
Shared memory arrays are subdivided into smaller subarrays called
banks
Shared memory has 32 (16) banks in 2.X (1.X). Successive 32-bit
words are assigned to successive banks
Different banks can be accessed simultaneously
If two or more addresses of a memory request are in the same bank,
the access is serialized
- Bank conflicts exist only within a warp (half warp for 1.X)
In 2.X there is no bank conflict if the memory request is for the same
32-bit word. This is not valid in 1.X.
23
Shared Memory - Bank conflicts
24
Shared Memory
__syncthreads()
- Barrier that waits for all threads of the block before continuing
- Need to make sure all data is loaded to shared before access
- Avoids race conditions
- Serializes the code: don't overuse it!
u_shared[i] = u[I];
__syncthreads();
if (i > 0 && i < BLOCKSIZE-1)
    u[I] = u_shared[i] - c*dt/dx*(u_shared[i] - u_shared[i-1]);
25
Race condition
When two or more threads want to access and operate on a memory
location without synchronization
Example: we have the value 3 stored in global memory and two
threads want to add one to that value.
- Possibility 1:
Thread 1 reads the value 3 adds 1 and writes 4 back to memory
Thread 2 reads the value 4 and writes 5 back to memory
- Possibility 2:
Thread 1 reads the value 3
Thread 2 reads the value 3
Both threads operate on 3 and write back the value 4 to memory
Solutions:
- __syncthreads() or atomic operations
26
Race condition
When two or more threads want to access and operate on a memory
location without synchronization
Example: we have the value 3 stored in global memory and two
threads want to add one to that value.
- Possibility 1:
Thread 1 reads the value 3 adds 1 and writes 4 back to memory
Thread 2 reads the value 4 and writes 5 back to memory
- Possibility 2:
Thread 1 reads the value 3
Thread 2 reads the value 3
Both threads operate on 3 and write back the value 4 to memory
Solutions:
- __syncthreads() or atomic operations
26
Race condition
When two or more threads want to access and operate on a memory
location without synchronization
Example: we have the value 3 stored in global memory and two
threads want to add one to that value.
- Possibility 1:
Thread 1 reads the value 3 adds 1 and writes 4 back to memory
Thread 2 reads the value 4 and writes 5 back to memory
- Possibility 2:
Thread 1 reads the value 3
Thread 2 reads the value 3
Both threads operate on 3 and write back the value 4 to memory
Solutions:
- __syncthreads() or atomic operations
26
Atomic operations
Atomic operations deal with race conditions
- It guarantees that while the operation is being executed, that
location in memory is not accessed
- Still we can't rely on any ordering of thread execution!
- Types
atomicAdd
atomicSub
atomicExch
atomicMin
atomicMax
etc...
27
Atomic operations
__global__ void update(int *values, int *who)
{
    int i = threadIdx.x + blockDim.x*blockIdx.x;
    int I = who[i];
    atomicAdd(&values[I], 1);
}
David Tarjan - NVIDIA
28
Atomic operations
Useful if you have a sparse access pattern
Atomic operations are slower than normal functions
They can serialize your execution if many threads want to access the
same memory location
- Think about parallelizing your data, not only execution
- Use hierarchy of atomic operations to avoid this
Prefer __syncthreads() if you can use it instead
- If you have a regular access pattern
29