Dot Product

Program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
using namespace std;

#define SIZE 4

// Each thread multiplies one pair of elements and accumulates the product into *c.
// atomicAdd is needed here: a plain "*c += a[i]*b[i]" is a race between the threads.
__global__ void dotProduct(int *a, int *b, int *c)
{
int i = threadIdx.x;
atomicAdd(c, a[i] * b[i]);
}

int main()
{
int a[SIZE] = {1,2,3,4};
int b[SIZE] = {1,2,3,4};
int c = 0;

int *da, *db, *dc, size = SIZE*sizeof(int);


cudaMalloc((void **)&da, size);
cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&db, size);
cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&dc, sizeof(int));
cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

dotProduct<<<1,SIZE>>>(da, db, dc);


cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);
cout<<"Dot Product : "<<c;
return 0;
}

Input:
a = {1, 2, 3, 4}, b = {1, 2, 3, 4}
Output:
Dot Product : 30
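
The value 30 is just 1*1 + 2*2 + 3*3 + 4*4. A minimal host-only sketch (not part of the listing above) that computes the same reference sum:

// Sequential reference for the dot product of the two sample vectors.
#include <iostream>
int main()
{
    int a[4] = {1, 2, 3, 4}, b[4] = {1, 2, 3, 4}, sum = 0;
    for (int i = 0; i < 4; i++)
        sum += a[i] * b[i];                    // 1*1 + 2*2 + 3*3 + 4*4
    std::cout << "Dot Product : " << sum << std::endl;   // prints 30
    return 0;
}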

1. Sub String

Program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstring>
#include <iostream>
using namespace std;

// Each thread tests one starting position i of the text for a full match of the pattern.
__global__ void subString(char *da, char *db, int lena, int lenb, int *count)
{
int i = threadIdx.x, c = 0;

for (int j = 0; j<lenb; i++, j++)
if (da[i] == db[j])
c++;

if (c == lenb)
atomicAdd(count, 1);   // "count++" would only advance the pointer, not the tally
}
int main()
{
char a[] = "HaiHelloHowru", b[] = "H";
int lena = strlen(a), lenb = strlen(b);
cout << a << "\t" << lena << endl << b << "\t" << lenb << endl;

char *da, *db;


cudaMalloc((void**)&da, lena);
cudaMemcpy(da, a, lena, cudaMemcpyHostToDevice);
cudaMalloc((void**)&db, lenb);
cudaMemcpy(db, b, lenb, cudaMemcpyHostToDevice);

int c = 0, *dc;
cudaMalloc((void **)&dc, sizeof(int));
cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

subString<<<1, lena - lenb + 1>>>(da, db, lena, lenb, dc);   // one thread per candidate start position

cudaDeviceSynchronize();

cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);

cout << "\nCount : " << c;

return 0;
}

Input:
a = "HaiHelloHowru", b = "H"
Output:
HaiHelloHowru   13
H   1
Count : 3
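
The count of 3 corresponds to the three occurrences of "H" (positions 0, 3 and 8). A minimal host-only sketch (independent of the CUDA listing) that mirrors the kernel's per-position comparison:

// Sequential reference: count occurrences of b inside a by brute force.
#include <cstring>
#include <iostream>
int main()
{
    const char *a = "HaiHelloHowru", *b = "H";
    int lena = std::strlen(a), lenb = std::strlen(b), count = 0;
    for (int i = 0; i + lenb <= lena; i++)     // one candidate start position per i
    {
        int c = 0;
        for (int j = 0; j < lenb; j++)
            if (a[i + j] == b[j])
                c++;
        if (c == lenb)                         // full match starting at i
            count++;
    }
    std::cout << "Count : " << count << std::endl;   // prints 3 for this input
    return 0;
}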

2. Matrix Multiplication

Program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
#define SIZE 20
#define TILESIZE 2
#define WINDOW 2

// 1-D version: thread i computes the entire i-th row of C.
__global__ void matMul(int *a, int *b, int *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < SIZE)
for (int j = 0; j<SIZE; j++)
for (int k = 0; k<SIZE; k++)
c[i*SIZE + j] += a[i*SIZE + k] * b[k*SIZE + j];
}

// Each thread computes a WINDOW x WINDOW sub-block of C, so one
// TILESIZE x TILESIZE block of threads covers TILESIZE*WINDOW rows and columns.
__global__ void matMul2D(int *a, int *b, int *c)
{
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int I = i*WINDOW;
int J = j*WINDOW;
int m = I + WINDOW, n = J + WINDOW, k;

if (I < SIZE && J < SIZE)
for (i = I; i<m && i<SIZE; i++)
for (j = J; j<n && j<SIZE; j++)
for (k = 0; k<SIZE; k++)
c[i*SIZE + j] += a[i*SIZE + k] * b[k*SIZE + j];
}
void printMatrix(int *a)
{
for (int i = 0; i<SIZE; i++)
{
for (int j = 0; j<SIZE; j++)
cout << *a++ << "\t";
cout << endl;
}
}
//
int main()
{
int a[SIZE*SIZE], b[SIZE*SIZE], c[SIZE*SIZE], d[SIZE*SIZE];
int i, j, k = 0;

for (i = 0; i<SIZE; i++)
for (j = 0; j<SIZE; j++)
{
a[i*SIZE + j] = 1;
b[i*SIZE + j] = k++;
c[i*SIZE + j] = d[i*SIZE + j] = 0;
}
int *da, *db, *dc;
int size = SIZE*SIZE*sizeof(int);
cudaMalloc(&da, size);
cudaMalloc(&db, size);
cudaMalloc(&dc, size);

cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);


cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(dc, c, size, cudaMemcpyHostToDevice);

/* dim3 block(TILESIZE);
dim3 grid(SIZE/TILESIZE+1);
matMul<<<grid,block>>>(da, db, dc);
cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

cudaMemcpy(dc, d, size, cudaMemcpyHostToDevice);*/


dim3 block2D(TILESIZE, TILESIZE);
dim3 grid2D(SIZE / (TILESIZE*WINDOW) + 1, SIZE / (TILESIZE*WINDOW) + 1);
matMul2D<<<grid2D, block2D>>>(da, db, dc);
cudaMemcpy(d, dc, size, cudaMemcpyDeviceToHost);

/* cout<<endl;
printMatrix(c);
cout<<endl;
printMatrix(d);*/
}

Input (sample 5 x 5 run; the listing above uses SIZE = 20 and leaves the print calls commented out):
Matrix A
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2

Matrix B
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1

Output:
Matrix C
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
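
Every element of C in this sample is a row of A (all 2s) dotted with a column of B (all 1s), i.e. 2*1 summed five times = 10. A minimal host-only sketch of the same triple loop for the 5 x 5 sample (the listing itself uses SIZE = 20 and different data):

// Sequential reference multiply for the 5 x 5 sample matrices above.
#include <iostream>
#define DIM 5
int main()
{
    int a[DIM][DIM], b[DIM][DIM], c[DIM][DIM] = {};
    for (int i = 0; i < DIM; i++)
        for (int j = 0; j < DIM; j++)
        {
            a[i][j] = 2;
            b[i][j] = 1;
        }
    for (int i = 0; i < DIM; i++)              // row of A
        for (int j = 0; j < DIM; j++)          // column of B
            for (int k = 0; k < DIM; k++)
                c[i][j] += a[i][k] * b[k][j];  // accumulates 2*1 five times
    std::cout << "c[0][0] = " << c[0][0] << std::endl;   // prints 10
    return 0;
}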

3. Quick Sort
Program

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <vector>

#define N 5

using namespace std;

// One thread per pending segment: partition x[first..last] around its first element.
// list[idx] is set to (final pivot position + 1) if the segment is still unsorted, 0 otherwise.
__global__ void quickSort(int *x, int *dfirst, int *dlast, int *list)
{
int idx = threadIdx.x;
int first = dfirst[idx];
int last = dlast[idx];
list[idx] = 0;
if(first<last)
{
int pivot, j, temp, i;

pivot = first;
i = first;
j = last;

while(i<j)
{
while(x[i]<=x[pivot] && i<last)
i++;
while(x[j] > x[pivot])
j--;
if(i<j)
{
temp = x[i];
x[i] = x[j];
x[j] = temp;
}
}

temp = x[pivot];
x[pivot] = x[j];
x[j] = temp;

// If the segment is not yet fully sorted, record where it was split
// (pivot index + 1); 0 means the segment needs no further work.
for(i=first; i<last; i++)
if(x[i] > x[i+1])
{
list[idx] = j+1;
break;
}
}
}

int main()
{
int a[N] = {1, 5, 9, 3, 6}, *da, i, size = N*sizeof(int), len = 0;
int *list, *dlist, *dfirst, *dlast;

cudaMalloc(&da, size);
cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
vector<int> v;

while(true)
{
size = (++len)*sizeof(int);

int *first = (int *)malloc(size);
int *last = (int *)malloc(size);

first[0] = 0;
last[len-1] = N-1;

for(i=0; i<v.size(); i++)
{
first[i+1] = v[i]+1;
last[i] = v[i]-1;
}

cudaMalloc(&dfirst, size);
cudaMemcpy(dfirst, first, size, cudaMemcpyHostToDevice);
cudaMalloc(&dlast, size);
cudaMemcpy(dlast, last, size, cudaMemcpyHostToDevice);

cudaMalloc(&dlist, size);

quickSort<<<1,len>>>(da, dfirst, dlast, dlist);

list = (int *)malloc(size);


cudaMemcpy(list, dlist, size, cudaMemcpyDeviceToHost);

v.clear();
for(i=0; i<len; i++)
if(list[i] != 0)
v.push_back(list[i]-1);
len = v.size();

if(len == 0)
break;
}

cudaMemcpy(a, da, N*sizeof(int), cudaMemcpyDeviceToHost);


for(i=0; i<N; i++)
printf("%d\t", a[i]);
}

Input:
1 5 9 3 6
Output:
1 3 5 6 9
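
Each kernel launch partitions every pending segment in parallel, and the host loop keeps re-launching until no segment reports a split. A minimal host-only sketch (hypothetical helper, not from the listing) of the partition step a single thread performs on the whole array {1, 5, 9, 3, 6}:

#include <cstdio>

// Same partition scheme as the kernel: the pivot is the first element of the segment.
int partition(int *x, int first, int last)
{
    int pivot = first, i = first, j = last, temp;
    while (i < j)
    {
        while (x[i] <= x[pivot] && i < last) i++;
        while (x[j] > x[pivot]) j--;
        if (i < j) { temp = x[i]; x[i] = x[j]; x[j] = temp; }
    }
    temp = x[pivot]; x[pivot] = x[j]; x[j] = temp;
    return j;                                   // final position of the pivot
}

int main()
{
    int a[5] = {1, 5, 9, 3, 6};
    int p = partition(a, 0, 4);                 // pivot value 1 is already the smallest
    printf("pivot lands at index %d\n", p);     // prints 0, so the next segment is [1, 4]
    return 0;
}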

4. Gauss Elimination

Program
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define N 4
#define M (N+1)

// Elimination step i: each block takes one row below the pivot row,
// and the threads of the block update that row's remaining columns.
__global__ void ge(float *A, int i)
{
for (int j = blockIdx.x + i + 1; j<N; j += gridDim.x)
{
float ratio = A[j*(N + 1) + i] / A[i*(N + 1) + i];
for (int k = threadIdx.x + i; k <= N; k += blockDim.x)
A[j*(N + 1) + k] -= ratio*A[i*(N + 1) + k];
}
}

void print(float *A)
{
for (int i = 0; i<N; i++)
{
for (int j = 0; j<N + 1; j++)
printf("%f ", A[i*(N + 1) + j]);
printf("\n");
}
printf("\n");
}

int main()
{
float A[N*(N + 1)] = { 2, 1, -1, 2, 5, 4, 5, -3, 6, 9, -2, 5, -2, 6,
4, 4, 11, -4, 8, 2 };
float *dev_a;
int size = N*(N + 1)*sizeof(float), i, j;

print(A);

cudaMalloc(&dev_a, size);
cudaMemcpy(dev_a, A, size, cudaMemcpyHostToDevice);

for (i = 0; i<N - 1; i++)
ge<<<N - 1 - i, N - i + 1>>>(dev_a, i);

cudaMemcpy(A, dev_a, size, cudaMemcpyDeviceToHost);

print(A);

// Back substitution: when row i is processed, every A[i][j] with j > i has already
// been replaced by a_ij * x_j, so the right-hand side only needs those subtracted
// followed by a division by the diagonal element.
for (i = N - 1; i >= 0; i--)
{
for (j = N - 1; j >= i + 1; j--)
A[i*(N + 1) + N] -= A[i*(N + 1) + j];
A[i*(N + 1) + N] /= A[i*(N + 1) + i];
printf("x%d=%.1f\n", i, A[i*(N + 1) + N]);

// Fold x_i into the rows above so that their column-i entries become a_ji * x_i.
for (j = i - 1; j >= 0; j--)
A[j*(N + 1) + i] *= A[i*(N + 1) + N];
}
return 0;
}
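
For the augmented matrix above, back substitution should give x0 = 1.0, x1 = -2.0, x2 = 1.0 and x3 = 3.0 (printed from x3 down to x0). A minimal host-only sketch (independent of the CUDA program) that checks this solution by substituting it back into the original equations:

// Substitute the expected solution into each row of the original system.
#include <cstdio>
#define N 4
int main()
{
    float A[N][N + 1] = { { 2, 1, -1, 2, 5 },
                          { 4, 5, -3, 6, 9 },
                          { -2, 5, -2, 6, 4 },
                          { 4, 11, -4, 8, 2 } };
    float x[N] = { 1.0f, -2.0f, 1.0f, 3.0f };   // expected solution
    for (int i = 0; i < N; i++)
    {
        float lhs = 0.0f;
        for (int j = 0; j < N; j++)
            lhs += A[i][j] * x[j];
        printf("row %d: lhs = %.1f  rhs = %.1f\n", i, lhs, A[i][N]);
    }
    return 0;
}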
