Dot Product

Program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
using namespace std;

#define SIZE 4

// Each thread multiplies one pair of elements and accumulates the product into *c.
// atomicAdd is needed here: a plain "*c += a[i]*b[i]" is a race between the threads.
__global__ void dotProduct(int *a, int *b, int *c)
{
int i = threadIdx.x;
atomicAdd(c, a[i] * b[i]);
}

int main()
{
int a[SIZE] = {1,2,3,4};
int b[SIZE] = {1,2,3,4};
int c = 0;

int *da, *db, *dc, size = SIZE*sizeof(int);


cudaMalloc((void **)&da, size);
cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&db, size);
cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
cudaMalloc((void **)&dc, sizeof(int));
cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

dotProduct<<<1,SIZE>>>(da, db, dc);


cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);
cout<<"Dot Product : "<<c;
return 0;
}

Input:
a = {1, 2, 3, 4}, b = {1, 2, 3, 4}
Output:
Dot Product : 30
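
The value 30 is just 1*1 + 2*2 + 3*3 + 4*4. A minimal host-only sketch (not part of the listing above) that computes the same reference sum:

// Sequential reference for the dot product of the two sample vectors.
#include <iostream>
int main()
{
    int a[4] = {1, 2, 3, 4}, b[4] = {1, 2, 3, 4}, sum = 0;
    for (int i = 0; i < 4; i++)
        sum += a[i] * b[i];                    // 1*1 + 2*2 + 3*3 + 4*4
    std::cout << "Dot Product : " << sum << std::endl;   // prints 30
    return 0;
}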

1. Sub String

Program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstring>
#include <iostream>
using namespace std;

// Each thread tests one starting position i of the text for a full match of the pattern.
__global__ void subString(char *da, char *db, int lena, int lenb, int *count)
{
int i = threadIdx.x, c = 0;

for (int j = 0; j<lenb; i++, j++)
if (da[i] == db[j])
c++;

if (c == lenb)
atomicAdd(count, 1);   // "count++" would only advance the pointer, not the tally
}
int main()
{
char a[] = "HaiHelloHowru", b[] = "H";
int lena = strlen(a), lenb = strlen(b);
cout << a << "\t" << lena << endl << b << "\t" << lenb << endl;

char *da, *db;


cudaMalloc((void**)&da, lena);
cudaMemcpy(da, a, lena, cudaMemcpyHostToDevice);
cudaMalloc((void**)&db, lenb);
cudaMemcpy(db, b, lenb, cudaMemcpyHostToDevice);

int c = 0, *dc;
cudaMalloc((void **)&dc, sizeof(int));
cudaMemcpy(dc, &c, sizeof(int), cudaMemcpyHostToDevice);

subString<<<1, lena - lenb + 1>>>(da, db, lena, lenb, dc);   // one thread per candidate start position

cudaDeviceSynchronize();

cudaMemcpy(&c, dc, sizeof(int), cudaMemcpyDeviceToHost);

cout << "\nCount : " << c;

return 0;
}

Input:
a = "HaiHelloHowru", b = "H"
Output:
HaiHelloHowru   13
H   1
Count : 3
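
The count of 3 corresponds to the three occurrences of "H" (positions 0, 3 and 8). A minimal host-only sketch (independent of the CUDA listing) that mirrors the kernel's per-position comparison:

// Sequential reference: count occurrences of b inside a by brute force.
#include <cstring>
#include <iostream>
int main()
{
    const char *a = "HaiHelloHowru", *b = "H";
    int lena = std::strlen(a), lenb = std::strlen(b), count = 0;
    for (int i = 0; i + lenb <= lena; i++)     // one candidate start position per i
    {
        int c = 0;
        for (int j = 0; j < lenb; j++)
            if (a[i + j] == b[j])
                c++;
        if (c == lenb)                         // full match starting at i
            count++;
    }
    std::cout << "Count : " << count << std::endl;   // prints 3 for this input
    return 0;
}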

2. Matrix Multiplication

Program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
#define SIZE 20
#define TILESIZE 2
#define WINDOW 2

// 1-D version: thread i computes the entire i-th row of C.
__global__ void matMul(int *a, int *b, int *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < SIZE)
for (int j = 0; j<SIZE; j++)
for (int k = 0; k<SIZE; k++)
c[i*SIZE + j] += a[i*SIZE + k] * b[k*SIZE + j];
}

// Each thread computes a WINDOW x WINDOW sub-block of C, so one
// TILESIZE x TILESIZE block of threads covers TILESIZE*WINDOW rows and columns.
__global__ void matMul2D(int *a, int *b, int *c)
{
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int I = i*WINDOW;
int J = j*WINDOW;
int m = I + WINDOW, n = J + WINDOW, k;

if (I < SIZE && J < SIZE)
for (i = I; i<m && i<SIZE; i++)
for (j = J; j<n && j<SIZE; j++)
for (k = 0; k<SIZE; k++)
c[i*SIZE + j] += a[i*SIZE + k] * b[k*SIZE + j];
}
void printMatrix(int *a)
{
for (int i = 0; i<SIZE; i++)
{
for (int j = 0; j<SIZE; j++)
cout << *a++ << "\t";
cout << endl;
}
}
//
int main()
{
int a[SIZE*SIZE], b[SIZE*SIZE], c[SIZE*SIZE], d[SIZE*SIZE];
int i, j, k = 0;

for (i = 0; i<SIZE; i++)
for (j = 0; j<SIZE; j++)
{
a[i*SIZE + j] = 1;
b[i*SIZE + j] = k++;
c[i*SIZE + j] = d[i*SIZE + j] = 0;
}
int *da, *db, *dc;
int size = SIZE*SIZE*sizeof(int);
cudaMalloc(&da, size);
cudaMalloc(&db, size);
cudaMalloc(&dc, size);

cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);


cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(dc, c, size, cudaMemcpyHostToDevice);

/* dim3 block(TILESIZE);
dim3 grid(SIZE/TILESIZE+1);
matMul<<<grid,block>>>(da, db, dc);
cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);

cudaMemcpy(dc, d, size, cudaMemcpyHostToDevice);*/


dim3 block2D(TILESIZE, TILESIZE);
dim3 grid2D(SIZE / (TILESIZE*WINDOW) + 1, SIZE / (TILESIZE*WINDOW) + 1);
matMul2D<<<grid2D, block2D>>>(da, db, dc);
cudaMemcpy(d, dc, size, cudaMemcpyDeviceToHost);

/* cout<<endl;
printMatrix(c);
cout<<endl;
printMatrix(d);*/
}

Input (sample 5 x 5 run; the listing above uses SIZE = 20 and leaves the print calls commented out):
Matrix A
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2
2 2 2 2 2

Matrix B
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1

Output:
Matrix C
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
10 10 10 10 10
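
Every element of C in this sample is a row of A (all 2s) dotted with a column of B (all 1s), i.e. 2*1 summed five times = 10. A minimal host-only sketch of the same triple loop for the 5 x 5 sample (the listing itself uses SIZE = 20 and different data):

// Sequential reference multiply for the 5 x 5 sample matrices above.
#include <iostream>
#define DIM 5
int main()
{
    int a[DIM][DIM], b[DIM][DIM], c[DIM][DIM] = {};
    for (int i = 0; i < DIM; i++)
        for (int j = 0; j < DIM; j++)
        {
            a[i][j] = 2;
            b[i][j] = 1;
        }
    for (int i = 0; i < DIM; i++)              // row of A
        for (int j = 0; j < DIM; j++)          // column of B
            for (int k = 0; k < DIM; k++)
                c[i][j] += a[i][k] * b[k][j];  // accumulates 2*1 five times
    std::cout << "c[0][0] = " << c[0][0] << std::endl;   // prints 10
    return 0;
}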

3. Quick Sort
Program

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <vector>

#define N 5

using namespace std;

// One thread per pending segment: partition x[first..last] around its first element.
// list[idx] is set to (final pivot position + 1) if the segment is still unsorted, 0 otherwise.
__global__ void quickSort(int *x, int *dfirst, int *dlast, int *list)
{
int idx = threadIdx.x;
int first = dfirst[idx];
int last = dlast[idx];
list[idx] = 0;
if(first<last)
{
int pivot, j, temp, i;

pivot = first;
i = first;
j = last;

while(i<j)
{
while(x[i]<=x[pivot] && i<last)
i++;
while(x[j] > x[pivot])
j--;
if(i<j)
{
temp = x[i];
x[i] = x[j];
x[j] = temp;
}
}

temp = x[pivot];
x[pivot] = x[j];
x[j] = temp;

// If the segment is not yet fully sorted, record where it was split
// (pivot index + 1); 0 means the segment needs no further work.
for(i=first; i<last; i++)
if(x[i] > x[i+1])
{
list[idx] = j+1;
break;
}
}
}

int main()
{
int a[N] = {1, 5, 9, 3, 6}, *da, i, size = N*sizeof(int), len = 0;
int *list, *dlist, *dfirst, *dlast;

cudaMalloc(&da, size);
cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
vector<int> v;

while(true)
{
size = (++len)*sizeof(int);

int *first = (int *)malloc(size);
int *last = (int *)malloc(size);

first[0] = 0;
last[len-1] = N-1;

for(i=0; i<v.size(); i++)
{
first[i+1] = v[i]+1;
last[i] = v[i]-1;
}

cudaMalloc(&dfirst, size);
cudaMemcpy(dfirst, first, size, cudaMemcpyHostToDevice);
cudaMalloc(&dlast, size);
cudaMemcpy(dlast, last, size, cudaMemcpyHostToDevice);

cudaMalloc(&dlist, size);

quickSort<<<1,len>>>(da, dfirst, dlast, dlist);

list = (int *)malloc(size);


cudaMemcpy(list, dlist, size, cudaMemcpyDeviceToHost);

v.clear();
for(i=0; i<len; i++)
if(list[i] != 0)
v.push_back(list[i]-1);
len = v.size();

if(len == 0)
break;
}

cudaMemcpy(a, da, N*sizeof(int), cudaMemcpyDeviceToHost);


for(i=0; i<N; i++)
printf("%d\t", a[i]);
}

Input:
1 5 9 3 6
Output:
1 3 5 6 9
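
Each kernel launch partitions every pending segment in parallel, and the host loop keeps re-launching until no segment reports a split. A minimal host-only sketch (hypothetical helper, not from the listing) of the partition step a single thread performs on the whole array {1, 5, 9, 3, 6}:

#include <cstdio>

// Same partition scheme as the kernel: the pivot is the first element of the segment.
int partition(int *x, int first, int last)
{
    int pivot = first, i = first, j = last, temp;
    while (i < j)
    {
        while (x[i] <= x[pivot] && i < last) i++;
        while (x[j] > x[pivot]) j--;
        if (i < j) { temp = x[i]; x[i] = x[j]; x[j] = temp; }
    }
    temp = x[pivot]; x[pivot] = x[j]; x[j] = temp;
    return j;                                   // final position of the pivot
}

int main()
{
    int a[5] = {1, 5, 9, 3, 6};
    int p = partition(a, 0, 4);                 // pivot value 1 is already the smallest
    printf("pivot lands at index %d\n", p);     // prints 0, so the next segment is [1, 4]
    return 0;
}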

4. Gauss Elimination

Program
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define N 4
#define M (N+1)

// Elimination step i: each block takes one row below the pivot row,
// and the threads of the block update that row's remaining columns.
__global__ void ge(float *A, int i)
{
for (int j = blockIdx.x + i + 1; j<N; j += gridDim.x)
{
float ratio = A[j*(N + 1) + i] / A[i*(N + 1) + i];
for (int k = threadIdx.x + i; k <= N; k += blockDim.x)
A[j*(N + 1) + k] -= ratio*A[i*(N + 1) + k];
}
}

void print(float *A)
{
for (int i = 0; i<N; i++)
{
for (int j = 0; j<N + 1; j++)
printf("%f ", A[i*(N + 1) + j]);
printf("\n");
}
printf("\n");
}

int main()
{
float A[N*(N + 1)] = { 2, 1, -1, 2, 5, 4, 5, -3, 6, 9, -2, 5, -2, 6,
4, 4, 11, -4, 8, 2 };
float *dev_a;
int size = N*(N + 1)*sizeof(float), i, j;

print(A);

cudaMalloc(&dev_a, size);
cudaMemcpy(dev_a, A, size, cudaMemcpyHostToDevice);

for (i = 0; i<N - 1; i++)
ge<<<N - 1 - i, N - i + 1>>>(dev_a, i);

cudaMemcpy(A, dev_a, size, cudaMemcpyDeviceToHost);

print(A);

// Back substitution: when row i is processed, every A[i][j] with j > i has already
// been replaced by a_ij * x_j, so the right-hand side only needs those subtracted
// followed by a division by the diagonal element.
for (i = N - 1; i >= 0; i--)
{
for (j = N - 1; j >= i + 1; j--)
A[i*(N + 1) + N] -= A[i*(N + 1) + j];
A[i*(N + 1) + N] /= A[i*(N + 1) + i];
printf("x%d=%.1f\n", i, A[i*(N + 1) + N]);

// Fold x_i into the rows above so that their column-i entries become a_ji * x_i.
for (j = i - 1; j >= 0; j--)
A[j*(N + 1) + i] *= A[i*(N + 1) + N];
}
return 0;
}
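
For the augmented matrix above, back substitution should give x0 = 1.0, x1 = -2.0, x2 = 1.0 and x3 = 3.0 (printed from x3 down to x0). A minimal host-only sketch (independent of the CUDA program) that checks this solution by substituting it back into the original equations:

// Substitute the expected solution into each row of the original system.
#include <cstdio>
#define N 4
int main()
{
    float A[N][N + 1] = { { 2, 1, -1, 2, 5 },
                          { 4, 5, -3, 6, 9 },
                          { -2, 5, -2, 6, 4 },
                          { 4, 11, -4, 8, 2 } };
    float x[N] = { 1.0f, -2.0f, 1.0f, 3.0f };   // expected solution
    for (int i = 0; i < N; i++)
    {
        float lhs = 0.0f;
        for (int j = 0; j < N; j++)
            lhs += A[i][j] * x[j];
        printf("row %d: lhs = %.1f  rhs = %.1f\n", i, lhs, A[i][N]);
    }
    return 0;
}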
