Cuda Examples

The document provides examples of CUDA code for common parallel computing tasks, including the dot product, matrix multiplication, sparse matrix-vector multiplication, and parallel reduction. It also describes implementing the SAXPY operation (y = ax + y) both serially and in parallel in CUDA. For the sparse case, it shows a compressed sparse row (CSR) representation and serial and parallel implementations of sparse matrix/vector multiplication, and demonstrates caching values in shared memory to improve performance.


Examples of CUDA code

1) The dot product
2) Matrix multiplication
3) Sparse matrix multiplication
4) Global reduction

Computing y = ax + y with a Serial Loop

void saxpy_serial(int n, float alpha, float *x, float *y)
{
    for (int i = 0; i < n; ++i)
        y[i] = alpha*x[i] + y[i];
}

// Invoke serial SAXPY kernel
saxpy_serial(n, 2.0, x, y);

Computingy=ax+yinparallelusingCUDA
_global_void saxpy_parallel(int n,floatalpha,float*x,float*y)
{
int i =blockIdx.x*blockDim.x +threadIdx.x;
if(i<n)y[i]=alpha*x[i]+y[i];
}
//InvokeparallelSAXPYkernel(256threadsperblock)\\
intnblocks=(n+255)/256;
saxpy_parallel<<<nblocks,256>>>(n,2.0,x,y);
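
The launch above assumes x and y are device pointers. A minimal host-side sketch of the surrounding setup (the names h_x, h_y, d_x, d_y are illustrative assumptions for host arrays of length n and their device copies, not part of the original):

float *d_x, *d_y;
cudaMalloc((void**)&d_x, n*sizeof(float));
cudaMalloc((void**)&d_y, n*sizeof(float));
cudaMemcpy(d_x, h_x, n*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, n*sizeof(float), cudaMemcpyHostToDevice);

int nblocks = (n + 255)/256;
saxpy_parallel<<<nblocks, 256>>>(n, 2.0f, d_x, d_y);

cudaMemcpy(h_y, d_y, n*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);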

Computing matrix multiplication in parallel using CUDA

__global__ void mm_simple(float *C, float *A, float *B, int n)
{
    // Each thread computes one element of C = A*B for dense n x n matrices.
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += A[row*n + k]*B[k*n + col];
        }
        C[row*n + col] = sum;
    }
}
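
The document does not show a launch for mm_simple. A possible configuration (an assumption, not from the original) uses one thread per element of C with 16x16 thread blocks; the grid is rounded up so the bounds check in the kernel handles matrix sizes that are not multiples of 16:

dim3 threads(16, 16);
dim3 grid((n + threads.x - 1)/threads.x,
          (n + threads.y - 1)/threads.y);
mm_simple<<<grid, threads>>>(C, A, B, n);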

Sparse matrix representation

A =
    [ 3  0  9  0  0 ]
    [ 0  5  0  0  2 ]
    [ 0  0  7  0  0 ]
    [ 0  0  5  8  4 ]
    [ 0  0  6  0  0 ]

Av = [3 9 5 2 7 5 8 4 6]  = nonzero elements
Aj = [0 2 1 4 2 2 3 4 2]  = column indices of the elements
Ap = [0 2 4 5 8 9]        = pointers to the first element in each row
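
Written out as C arrays, the same example (a sketch, using the 0-based indices shown above) would be:

float Av[] = {3, 9, 5, 2, 7, 5, 8, 4, 6};  // nonzero values
int   Aj[] = {0, 2, 1, 4, 2, 2, 3, 4, 2};  // column index of each value
int   Ap[] = {0, 2, 4, 5, 8, 9};           // row r owns Av[Ap[r]] .. Av[Ap[r+1]-1]
int   num_rows = 5;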

Serial sparse matrix/vector multiplication

float multiply_row(int rowsize,
                   int *Aj,    // column indices for row
                   float *Av,  // nonzero entries for row
                   float *x)   // the RHS vector
{
    float sum = 0;
    for (int column = 0; column < rowsize; ++column)
        sum += Av[column]*x[Aj[column]];
    return sum;
}

void csrmul_serial(int *Ap, int *Aj, float *Av, int num_rows,
                   float *x, float *y)
{
    for (int row = 0; row < num_rows; ++row)
    {
        int row_begin = Ap[row];
        int row_end   = Ap[row+1];
        y[row] = multiply_row(row_end - row_begin, Aj + row_begin,
                              Av + row_begin, x);
    }
}
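
As a quick check, a small test driver (not part of the original; it repeats the CSR arrays so the snippet stands alone) multiplies the example matrix A by x = [1 1 1 1 1], so each y[row] is simply the sum of the nonzeros in that row, y = [12 7 7 17 6]:

#include <stdio.h>

int main(void)
{
    float Av[] = {3, 9, 5, 2, 7, 5, 8, 4, 6};
    int   Aj[] = {0, 2, 1, 4, 2, 2, 3, 4, 2};
    int   Ap[] = {0, 2, 4, 5, 8, 9};
    float x[]  = {1, 1, 1, 1, 1};
    float y[5];

    csrmul_serial(Ap, Aj, Av, 5, x, y);
    for (int row = 0; row < 5; ++row)
        printf("y[%d] = %g\n", row, y[row]);   // 12 7 7 17 6
    return 0;
}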

Parallel sparse matrix/vector multiplication

// Note: multiply_row() must be declared as a __device__ function
// (or __host__ __device__) to be callable from this kernel.
__global__ void csrmul_kernel(int *Ap, int *Aj, float *Av, int num_rows,
                              float *x, float *y)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    if (row < num_rows)
    {
        int row_begin = Ap[row];
        int row_end   = Ap[row+1];
        y[row] = multiply_row(row_end - row_begin, Aj + row_begin,
                              Av + row_begin, x);
    }
}

The code to launch the above parallel kernel is:

unsigned int blocksize = 128;   // or any size up to 512
unsigned int nblocks = (num_rows + blocksize - 1)/blocksize;
csrmul_kernel<<<nblocks, blocksize>>>(Ap, Aj, Av, num_rows, x, y);

Caching in shared memory

[Figure: each thread block caches the window of x[] between block_begin and block_end in shared memory; most of the nonzero elements are expected near the diagonal, so the row executed by a thread reads mostly from that cached window.]

__global__ void csrmul_cached(int *Ap, int *Aj, float *Av, int num_rows,
                              const float *x, float *y)
{
    // Cache the rows of x[] corresponding to this block.
    // blocksize must be a compile-time constant (e.g. a #define).
    __shared__ float cache[blocksize];

    int block_begin = blockIdx.x*blockDim.x;
    int block_end   = block_begin + blockDim.x;
    int row = block_begin + threadIdx.x;

    // Fetch and cache our window of x[].
    if (row < num_rows) cache[threadIdx.x] = x[row];
    __syncthreads();

    if (row < num_rows)
    {
        int row_begin = Ap[row];
        int row_end   = Ap[row+1];
        float x_j, sum = 0;
        for (int col = row_begin; col < row_end; ++col)
        {
            int j = Aj[col];
            // Fetch x_j from our cache when possible.
            if (j >= block_begin && j < block_end)
                x_j = cache[j - block_begin];
            else
                x_j = x[j];
            sum += Av[col]*x_j;
        }
        y[row] = sum;
    }
}
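
The document does not show a launch for csrmul_cached. A sketch (an assumption, not from the original), with blocksize defined as a macro so it can also size the __shared__ array inside the kernel:

#define blocksize 128

unsigned int nblocks = (num_rows + blocksize - 1)/blocksize;
csrmul_cached<<<nblocks, blocksize>>>(Ap, Aj, Av, num_rows, x, y);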

Parallel reduction

__global__ void plus_reduce(int *input, int N, int *total)
{
    int tid = threadIdx.x;
    int i = blockIdx.x*blockDim.x + threadIdx.x;

    // Each block loads its elements into shared memory
    // (blocksize must be a compile-time constant equal to blockDim.x).
    __shared__ int x[blocksize];
    x[tid] = (i < N) ? input[i] : 0;   // last block may pad with 0s
    __syncthreads();

    // Build summation tree over elements.
    for (int s = blockDim.x/2; s > 0; s = s/2)
    {
        if (tid < s) x[tid] += x[tid + s];
        __syncthreads();
    }

    // Thread 0 adds the partial sum to the total sum
    if (tid == 0) atomicAdd(total, x[tid]);
}
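
A minimal host-side sketch for invoking plus_reduce (the pointer names d_input and d_total are assumptions, not from the original, and blocksize is the same compile-time constant that sizes the shared array). *total must be zeroed before the launch because every block adds its partial sum into it with atomicAdd:

#define blocksize 256

int *d_total;
cudaMalloc((void**)&d_total, sizeof(int));
cudaMemset(d_total, 0, sizeof(int));

int nblocks = (N + blocksize - 1)/blocksize;
plus_reduce<<<nblocks, blocksize>>>(d_input, N, d_total);

int result;
cudaMemcpy(&result, d_total, sizeof(int), cudaMemcpyDeviceToHost);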
