CUDA Examples
1) The dot product
2) Matrix-vector multiplication
3) Sparse matrix multiplication
4) Global reduction
Computing y = ax + y with a Serial Loop

void saxpy_serial(int n, float alpha, float *x, float *y)
{
    for (int i = 0; i < n; ++i)
        y[i] = alpha * x[i] + y[i];
}

// Invoke serial SAXPY kernel
saxpy_serial(n, 2.0, x, y);
Computingy=ax+yinparallelusingCUDA
_global_void saxpy_parallel(int n,floatalpha,float*x,float*y)
{
int i =blockIdx.x*blockDim.x +threadIdx.x;
if(i<n)y[i]=alpha*x[i]+y[i];
}
//InvokeparallelSAXPYkernel(256threadsperblock)\\
intnblocks=(n+255)/256;
saxpy_parallel<<<nblocks,256>>>(n,2.0,x,y);
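For context, a minimal host-side sketch of how the data might be staged for this launch (assumptions: host arrays h_x and h_y of length n already exist; error checking omitted for brevity):

float *d_x, *d_y;
cudaMalloc((void**)&d_x, n * sizeof(float));
cudaMalloc((void**)&d_y, n * sizeof(float));
cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, n * sizeof(float), cudaMemcpyHostToDevice);

int nblocks = (n + 255) / 256;
saxpy_parallel<<<nblocks, 256>>>(n, 2.0f, d_x, d_y);

// Copy the result back and release device memory
cudaMemcpy(h_y, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);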
Computing matrix multiplication in parallel using CUDA

__global__ void mm_simple(float *C, float *A, float *B, int n)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against threads outside the n x n matrix
    if (row < n && col < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += A[row * n + k] * B[k * n + col];
        }
        C[row * n + col] = sum;
    }
}
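The kernel indexes threads in two dimensions, so it needs a 2D launch configuration. A minimal sketch (assumptions: A, B, and C are device pointers; 16x16 thread blocks are chosen purely for illustration):

dim3 threads(16, 16);
dim3 blocks((n + threads.x - 1) / threads.x,
            (n + threads.y - 1) / threads.y);
mm_simple<<<blocks, threads>>>(C, A, B, n);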
Sparse matrix representation

A = [ 3 0 9 0 0 ]
    [ 0 5 0 0 2 ]
    [ 0 0 7 0 0 ]
    [ 0 0 5 8 4 ]
    [ 0 0 6 0 0 ]

Av = [3 9 5 2 7 5 8 4 6]   // nonzero elements
Aj = [0 2 1 4 2 2 3 4 2]   // column indices of the elements
Ap = [0 2 4 5 8 9]         // pointers to the first element in each row
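Written out as C arrays, the CSR form of this example matrix is (a sketch; the array names follow the slide):

float Av[] = {3, 9, 5, 2, 7, 5, 8, 4, 6};  // nonzero values
int   Aj[] = {0, 2, 1, 4, 2, 2, 3, 4, 2};  // column index of each value
int   Ap[] = {0, 2, 4, 5, 8, 9};           // row r occupies [Ap[r], Ap[r+1])
int   num_rows = 5;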
Serial sparse matrix / vector multiplication

float multiply_row(int rowsize, int *Aj, float *Av, float *x);  // defined below

void csrmul_serial(int *Ap, int *Aj, float *Av, int num_rows,
                   float *x, float *y)
{
    for (int row = 0; row < num_rows; ++row)
    {
        int row_begin = Ap[row];
        int row_end = Ap[row + 1];
        y[row] = multiply_row(row_end - row_begin, Aj + row_begin,
                              Av + row_begin, x);
    }
}
float multiply_row(int rowsize,
                   int *Aj,    // column indices for row
                   float *Av,  // nonzero entries for row
                   float *x)   // the RHS vector
{
    float sum = 0;
    for (int column = 0; column < rowsize; ++column)
        sum += Av[column] * x[Aj[column]];
    return sum;
}
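As a quick check, multiplying the example matrix by the all-ones vector with the serial routine (using the CSR arrays above) gives the row sums:

float x[5] = {1, 1, 1, 1, 1};
float y[5];
csrmul_serial(Ap, Aj, Av, num_rows, x, y);
// y == {12, 7, 7, 17, 6}: each y[row] is the dot product of row 'row' of A with x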
Parallelsparsematrix/vectormultiplication
_global_void csrmul_kernel(int *Ap,int *Aj,float*Av,int num_rows,
float*x,float*y)
{
int row=blockIdx.x*blockDim.x +threadIdx.x;
if(row<num_rows )
{
int row_begin =Ap[row];
int row_end =Ap[row+1];
y[row]=multiply_row(row_end row_begin,Aj+row_begin,
Av+row_begin,x);
}
}
The code to launch the above parallel kernel is:

unsigned int blocksize = 128;  // or any size up to 512
unsigned int nblocks = (num_rows + blocksize - 1) / blocksize;
csrmul_kernel<<<nblocks, blocksize>>>(Ap, Aj, Av, num_rows, x, y);
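For the kernel to call multiply_row on the device, the function needs a device qualifier. One sketch (assuming the file is compiled with nvcc) qualifies it for both host and device, so the serial and parallel versions can share a single definition:

// Qualifying multiply_row for both host and device lets the same definition
// serve csrmul_serial and csrmul_kernel.
__host__ __device__ float multiply_row(int rowsize, int *Aj,
                                       float *Av, float *x)
{
    float sum = 0;
    for (int column = 0; column < rowsize; ++column)
        sum += Av[column] * x[Aj[column]];
    return sum;
}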
Caching in shared memory
[Figure: a thread block's window of rows, from block_begin to block_end; each thread executes one row. The block caches the corresponding window of x in shared memory, and most of the nonzero elements are expected to fall inside that window (around the diagonal).]
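A sketch of how that caching might look in code (assumptions: the matrix is square, so column indices are also valid indices into the cached window of x; BLOCKSIZE is a compile-time constant that must match blockDim.x at launch; entries of x outside the block's window are still read from global memory):

#define BLOCKSIZE 128

__global__ void csrmul_cached(int *Ap, int *Aj, float *Av, int num_rows,
                              float *x, float *y)
{
    __shared__ float cache[BLOCKSIZE];
    int block_begin = blockIdx.x * blockDim.x;
    int block_end   = block_begin + blockDim.x;
    int row         = block_begin + threadIdx.x;

    // Each thread caches the element of x for its own row
    if (row < num_rows) cache[threadIdx.x] = x[row];
    __syncthreads();

    if (row < num_rows)
    {
        float sum = 0;
        for (int col = Ap[row]; col < Ap[row + 1]; ++col)
        {
            int j = Aj[col];
            // Read x[j] from shared memory when it falls in this block's window
            float xj = (j >= block_begin && j < block_end)
                       ? cache[j - block_begin] : x[j];
            sum += Av[col] * xj;
        }
        y[row] = sum;
    }
}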
Parallel reduction

__global__ void plus_reduce(int *input, int N, int *total)
{
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Each block loads its elements into shared memory;
    // blocksize must be a compile-time constant (e.g. enum { blocksize = 256 };)
    __shared__ int x[blocksize];
    x[tid] = (i < N) ? input[i] : 0;  // last block may pad with 0s
    __syncthreads();

    // Build summation tree over elements.
    for (int s = blockDim.x / 2; s > 0; s = s / 2)
    {
        if (tid < s) x[tid] += x[tid + s];
        __syncthreads();
    }

    // Thread 0 adds the partial sum to the total sum
    if (tid == 0) atomicAdd(total, x[tid]);
}
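A host-side sketch of how plus_reduce might be invoked (assumptions: d_input already holds N ints on the device; blocksize is the same compile-time constant used for the shared array; error checking omitted):

int *d_total;
cudaMalloc((void**)&d_total, sizeof(int));
cudaMemset(d_total, 0, sizeof(int));  // the kernel accumulates into total, so zero it first

int nblocks = (N + blocksize - 1) / blocksize;
plus_reduce<<<nblocks, blocksize>>>(d_input, N, d_total);

int result;
cudaMemcpy(&result, d_total, sizeof(int), cudaMemcpyDeviceToHost);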