HPC Practicals

Parallel Breadth-First Search using OpenMP
Code

#include <iostream>
#include <vector>
#include <queue>
#include <ctime>
#include <omp.h>

using namespace std;

// Function to perform BFS from a given vertex
void bfs(int startVertex, vector<bool> &visited, vector<vector<int>> &graph)
{
// Create a queue for BFS
queue<int> q;

// Mark the start vertex as visited and enqueue it
visited[startVertex] = true;
q.push(startVertex);

// Loop until the queue is empty
while (!q.empty())
{
// Dequeue a vertex from the queue
int v = q.front();
q.pop();

// Enqueue all adjacent vertices that have not been visited.
// The critical section serializes the check-and-push, so this inner
// loop gains little from parallelism.
#pragma omp parallel for
for (int i = 0; i < (int)graph[v].size(); i++)
{
int u = graph[v][i];
#pragma omp critical
{
if (!visited[u])
{
visited[u] = true;
q.push(u);
}
}
}
}
}

// Parallel Breadth-First Search
void parallelBFS(vector<vector<int>> &graph, int numCores)
{
int numVertices = graph.size();
vector<bool> visited(numVertices, false); // Keep track of visited vertices

double startTime = omp_get_wtime(); // Start timer

// Perform BFS from all unvisited vertices using the specified number of cores.
// Note: visited is read here without synchronization, so a vertex may
// occasionally be expanded by more than one thread; updates inside bfs()
// are protected by the critical section.
#pragma omp parallel for num_threads(numCores)
for (int v = 0; v < numVertices; v++)
{
if (!visited[v])
{
bfs(v, visited, graph);
}
}

double endTime = omp_get_wtime(); // End timer

cout << "Number of cores used: " << numCores << endl;
cout << "Time taken: " << endTime - startTime << " seconds" << endl;
cout << "------------------------" << endl;
}

int main()
{
// Generate a random graph with 10,000 vertices and 50,000 edges
int numVertices = 10000;
int numEdges = 50000;
vector<vector<int>> graph(numVertices);
srand(time(0));
for (int i = 0; i < numEdges; i++)
{
int u = rand() % numVertices;
int v = rand() % numVertices;
graph[u].push_back(v);
graph[v].push_back(u);
}

// Array containing the number of cores to test
int numCoresArr[] = {1, 2, 3, 4, 5, 6, 7, 8};

// Loop over different core counts and execute parallel BFS
for (int i = 0; i < sizeof(numCoresArr) / sizeof(numCoresArr[0]); i++)
{
int numCores = numCoresArr[i];
cout << "Running parallel BFS with " << numCores << " core(s)..." << endl;
parallelBFS(graph, numCores);
}

return 0;
}
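
Note: the inner parallel loop above only parallelizes the neighbour expansion of a single vertex, and the critical section serializes most of that work. A more common scheme is level-synchronous BFS, which expands a whole frontier of vertices in parallel per iteration. The following is a minimal sketch of that idea, assuming the same adjacency-list representation as above; bfsLevelSync is a name introduced here for illustration and is not part of the practical.

#include <omp.h>
#include <vector>

void bfsLevelSync(int source, std::vector<std::vector<int>> &graph)
{
    // vector<char> instead of vector<bool>: vector<bool> packs bits and
    // is unsafe to write from multiple threads
    std::vector<char> visited(graph.size(), 0);
    std::vector<int> frontier{source};
    visited[source] = 1;

    while (!frontier.empty())
    {
        std::vector<int> next;
        // Expand every vertex of the current frontier in parallel; the
        // critical section protects the shared next-frontier vector
        #pragma omp parallel for
        for (int i = 0; i < (int)frontier.size(); i++)
        {
            for (int u : graph[frontier[i]])
            {
                #pragma omp critical
                {
                    if (!visited[u])
                    {
                        visited[u] = 1;
                        next.push_back(u);
                    }
                }
            }
        }
        frontier.swap(next);
    }
}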
Output
Parallel Depth-First Search using OpenMP
Code
#include <iostream>
#include <vector>
#include <stack>
#include <ctime>
#include <omp.h>

using namespace std;

// Function to perform DFS from a given vertex
void dfs(int startVertex, vector<bool> &visited, vector<vector<int>> &graph)
{
// Create a stack for DFS
stack<int> s;

// Mark the start vertex as visited and push it onto the stack
visited[startVertex] = true;
s.push(startVertex);

// Loop until the stack is empty
while (!s.empty())
{
// Pop a vertex from the stack
int v = s.top();
s.pop();

// Push all adjacent vertices that have not been visited onto the stack.
// As in the BFS version, the critical section serializes the check-and-push.
#pragma omp parallel for
for (int i = 0; i < (int)graph[v].size(); i++)
{
int u = graph[v][i];
#pragma omp critical
{
if (!visited[u])
{
visited[u] = true;
s.push(u);
}
}
}
}
}

// Parallel Depth-First Search
void parallelDFS(vector<vector<int>> &graph, int numCores)
{
int numVertices = graph.size();
vector<bool> visited(numVertices, false); // Keep track of visited vertices

double startTime = omp_get_wtime(); // Start timer

// Perform DFS from all unvisited vertices using the specified number of cores.
// Note: with neighbours pushed concurrently, the visit order is not a strict
// depth-first order, though every reachable vertex is still marked visited.
#pragma omp parallel for num_threads(numCores)
for (int v = 0; v < numVertices; v++)
{
if (!visited[v])
{
dfs(v, visited, graph);
}
}

double endTime = omp_get_wtime(); // End timer

cout << "Number of cores used: " << numCores << endl;
cout << "Time taken: " << endTime - startTime << " seconds" << endl;
cout << "------------------------" << endl;
}

int main()
{
// Generate a random graph with 10,000 vertices and 50,000 edges
int numVertices = 10000;
int numEdges = 50000;
vector<vector<int>> graph(numVertices);
srand(time(0));
for (int i = 0; i < numEdges; i++)
{
int u = rand() % numVertices;
int v = rand() % numVertices;
graph[u].push_back(v);
graph[v].push_back(u);
}

// Array containing the number of cores to test
int numCoresArr[] = {1, 2, 3, 4, 5, 6, 7, 8};

// Loop over different core counts and execute parallel DFS
for (int i = 0; i < sizeof(numCoresArr) / sizeof(numCoresArr[0]); i++)
{
int numCores = numCoresArr[i];
cout << "Running parallel DFS with " << numCores << " core(s)..." << endl;
parallelDFS(graph, numCores);
}

return 0;
}
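
Note: OpenMP tasks are the more idiomatic way to parallelize a recursive depth-first traversal than the shared-stack scheme above. A minimal sketch follows; dfsTask and runDfs are names introduced here for illustration and are not part of the practical.

#include <omp.h>
#include <vector>

// Recursively spawn a task per unvisited neighbour; the critical
// section makes the check-and-mark atomic
void dfsTask(int v, std::vector<char> &visited, std::vector<std::vector<int>> &graph)
{
    for (int u : graph[v])
    {
        bool expand = false;
        #pragma omp critical
        {
            if (!visited[u])
            {
                visited[u] = 1;
                expand = true;
            }
        }
        if (expand)
        {
            #pragma omp task shared(visited, graph)
            dfsTask(u, visited, graph);
        }
    }
}

// Typical driver: start the traversal inside a parallel region
void runDfs(int source, std::vector<std::vector<int>> &graph)
{
    std::vector<char> visited(graph.size(), 0);
    visited[source] = 1;
    #pragma omp parallel
    #pragma omp single
    dfsTask(source, visited, graph);
    // The implicit barrier at the end of the parallel region waits for all tasks
}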
Output
Parallel Bubble Sort (Odd-Even Transposition) using OpenMP
Code
#include <omp.h>
#include <stdlib.h>

#include <array>
#include <chrono>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::chrono::milliseconds;
using namespace std;

void s_bubble(int *, int);
void p_bubble(int *, int);
void swap(int &, int &);

// Sequential odd-even transposition sort
void s_bubble(int *a, int n)
{
for (int i = 0; i < n; i++)
{
int first = i % 2;
for (int j = first; j < n - 1; j += 2)
{
if (a[j] > a[j + 1])
{
swap(a[j], a[j + 1]);
}
}
}
}

// Parallel odd-even transposition sort: the comparisons within one pass
// touch disjoint pairs, so they can run concurrently
void p_bubble(int *a, int n)
{
for (int i = 0; i < n; i++)
{
int first = i % 2;
#pragma omp parallel for shared(a, first) num_threads(16)
for (int j = first; j < n - 1; j += 2)
{
if (a[j] > a[j + 1])
{
swap(a[j], a[j + 1]);
}
}
}
}

void swap(int &a, int &b)
{
int test;
test = a;
a = b;
b = test;
}

int bench_traverse(std::function<void()> traverse_fn)
{
auto start = high_resolution_clock::now();
traverse_fn();
auto stop = high_resolution_clock::now();

// Subtract the stop and start timepoints and cast to the required unit.
// Predefined units are nanoseconds, microseconds, milliseconds, seconds,
// minutes, hours. Use the duration_cast() function.
auto duration = duration_cast<milliseconds>(stop - start);

// To get the value of duration use the count() member function on the
// duration object
return duration.count();
}

int main(int argc, const char **argv)
{
if (argc < 2)
{
cout << "Specify array length.\n";
return 1;
}
int *a, n;
n = stoi(argv[1]);
a = new int[n];

for (int i = 0; i < n; i++)
{
a[i] = rand() % n;
}

int *b = new int[n];
copy(a, a + n, b);
cout << "Generated random array of length " << n << "\n\n";

int sequentialTime = bench_traverse([&] { s_bubble(a, n); });

omp_set_num_threads(16);
// Time the parallel sort on the untouched copy b, not the already-sorted a
int parallelTime = bench_traverse([&] { p_bubble(b, n); });

float speedUp = (float)sequentialTime / parallelTime;
float efficiency = speedUp / 16;

cout << "Sequential Bubble sort: " << sequentialTime << "ms\n";

cout << "Parallel (16) Bubble sort: " << parallelTime << "ms\n";

cout << "Speed Up: " << speedUp << "\n";

cout << "Efficiency: " << efficiency << "\n";

return 0;
}
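
The i % 2 / j += 2 structure above is odd-even transposition sort, a bubble-sort variant whose comparisons within a pass are independent and therefore safe to parallelize. As a quick correctness check, a minimal sketch of a verification step that could be added at the end of main() above, just before return 0 (is_sorted comes from <algorithm>, which would need to be included):

// Verify that both the sequential and parallel results are sorted
if (!is_sorted(a, a + n) || !is_sorted(b, b + n))
{
    cout << "Sort verification failed\n";
    return 1;
}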
Output
Parallel Merge Sort using OpenMP
Code
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>

using namespace std;

#define ARRAY_SIZE 5000

void merge(int arr[], int left[], int left_size, int right[], int right_size)
{
int i = 0, j = 0, k = 0;
while (i < left_size && j < right_size)
{
if (left[i] <= right[j])
{
arr[k] = left[i];
i++;
}
else
{
arr[k] = right[j];
j++;
}
k++;
}
while (i < left_size)
{
arr[k] = left[i];
i++;
k++;
}
while (j < right_size)
{
arr[k] = right[j];
j++;
k++;
}
}

void merge_sort(int arr[], int size)
{
if (size < 2)
{
return;
}
int mid = size / 2;
// Note: variable-length arrays are a GCC/Clang extension in C++
int left[mid], right[size - mid];
for (int i = 0; i < mid; i++)
{
left[i] = arr[i];
}
for (int i = mid; i < size; i++)
{
right[i - mid] = arr[i];
}
// Sort the two halves in parallel sections; nested inside an active
// parallel region these run sequentially unless nested parallelism is enabled
#pragma omp parallel sections
{
#pragma omp section
{
merge_sort(left, mid);
}
#pragma omp section
{
merge_sort(right, size - mid);
}
}
merge(arr, left, mid, right, size - mid);
}

int main()
{
int arr[ARRAY_SIZE];
int arr_copy[ARRAY_SIZE]; // unsorted copy so both sorts run on the same input
int num_threads_array[] = {16};
int num_threads_array_size = sizeof(num_threads_array) / sizeof(int);

// Initialize the array with random values
for (int i = 0; i < ARRAY_SIZE; i++)
{
arr[i] = rand() % ARRAY_SIZE;
arr_copy[i] = arr[i];
}

// Sort the array using normal merge sort
clock_t start_time = clock();
merge_sort(arr, ARRAY_SIZE);
clock_t end_time = clock();
// Note: clock() measures CPU time summed over all threads, not wall-clock
// time; omp_get_wtime() is the usual timer for parallel code
double normal_time = ((double)(end_time - start_time)) / CLOCKS_PER_SEC;

// Sort the array in parallel using OpenMP
for (int i = 0; i < num_threads_array_size; i++)
{
int num_threads = num_threads_array[i];
printf("Number of threads: %d\n", num_threads);
start_time = clock();
omp_set_num_threads(num_threads);
#pragma omp parallel
{
#pragma omp single
{
merge_sort(arr, ARRAY_SIZE);
}
}
end_time = clock();
double parallel_time = ((double)(end_time - start_time)) / CLOCKS_PER_SEC;

// Print the time taken by both merge sorts
printf("Time taken (normal merge sort): %f seconds\n", normal_time);
printf("Time taken (parallel merge sort): %f seconds\n", parallel_time);

float speedUp = normal_time / parallel_time;
float efficiency = speedUp / num_threads;

cout << "Speed Up: " << speedUp << "\n";
cout << "Efficiency: " << efficiency << "\n";
printf("\n");
}

return 0;
}
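
Note: since the nested parallel sections above add little once a parallel region is already active, OpenMP tasks are the usual way to parallelize recursive divide-and-conquer. A minimal sketch follows, reusing merge() from above; merge_sort_tasks and CUTOFF are names introduced here for illustration.

#include <omp.h>
#include <vector>

void merge(int arr[], int left[], int left_size, int right[], int right_size); // as defined above

void merge_sort_tasks(int arr[], int size)
{
    const int CUTOFF = 1000; // below this, recurse serially to limit task overhead
    if (size < 2)
        return;
    int mid = size / 2;
    std::vector<int> left(arr, arr + mid), right(arr + mid, arr + size);
    if (size >= CUTOFF)
    {
        #pragma omp task shared(left)
        merge_sort_tasks(left.data(), mid);
        #pragma omp task shared(right)
        merge_sort_tasks(right.data(), size - mid);
        #pragma omp taskwait // wait for both halves before merging
    }
    else
    {
        merge_sort_tasks(left.data(), mid);
        merge_sort_tasks(right.data(), size - mid);
    }
    merge(arr, left.data(), mid, right.data(), size - mid);
}

It is invoked the same way as merge_sort above, from inside #pragma omp parallel followed by #pragma omp single.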
Output
Parallel Reduction: Min, Max, Sum and Average using OpenMP
Code
#include <limits.h>
#include <omp.h>
#include <stdlib.h>

#include <array>
#include <chrono>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::chrono::milliseconds;
using namespace std;

void s_avg(int arr[], int n)
{
long sum = 0L;
int i;
for (i = 0; i < n; i++)
{
sum = sum + arr[i];
}
// cout << "\nAverage = " << sum / long(n) << "\n";
}

void p_avg(int arr[], int n)
{
long sum = 0L;
int i;
#pragma omp parallel for reduction(+ : sum) num_threads(16)
for (i = 0; i < n; i++)
{
sum = sum + arr[i];
}
// cout << "\nAverage = " << sum / long(n) << "\n";
}

void s_sum(int arr[], int n)
{
long sum = 0L;
int i;
for (i = 0; i < n; i++)
{
sum = sum + arr[i];
}
// cout << "\nSum = " << sum << "\n";
}

void p_sum(int arr[], int n)
{
long sum = 0L;
int i;
#pragma omp parallel for reduction(+ : sum) num_threads(16)
for (i = 0; i < n; i++)
{
sum = sum + arr[i];
}
// cout << "\nSum = " << sum << "\n";
}

void s_max(int arr[], int n)
{
int max_val = INT_MIN;
int i;
for (i = 0; i < n; i++)
{
if (arr[i] > max_val)
{
max_val = arr[i];
}
}
// cout << "\nMax value = " << max_val << "\n";
}

void p_max(int arr[], int n)
{
int max_val = INT_MIN;
int i;
#pragma omp parallel for reduction(max : max_val) num_threads(16)
for (i = 0; i < n; i++)
{
if (arr[i] > max_val)
{
max_val = arr[i];
}
}
// cout << "\nMax value = " << max_val << "\n";
}

void s_min(int arr[], int n)
{
int min_val = INT_MAX;
int i;
for (i = 0; i < n; i++)
{
if (arr[i] < min_val)
{
min_val = arr[i];
}
}
// cout << "\nMin value = " << min_val << "\n";
}

void p_min(int arr[], int n)
{
int min_val = INT_MAX;
int i;
#pragma omp parallel for reduction(min : min_val) num_threads(16)
for (i = 0; i < n; i++)
{
if (arr[i] < min_val)
{
min_val = arr[i];
}
}
// cout << "\nMin value = " << min_val << "\n";
}

int bench_traverse(std::function<void()> traverse_fn)
{
auto start = high_resolution_clock::now();
traverse_fn();
auto stop = high_resolution_clock::now();

// Subtract the stop and start timepoints and cast to the required unit.
// Predefined units are nanoseconds, microseconds, milliseconds, seconds,
// minutes, hours. Use the duration_cast() function.
auto duration = duration_cast<milliseconds>(stop - start);

// Return the value of the duration via the count() member function
return duration.count();
}

int main(int argc, const char **argv)
{
if (argc < 2)
{
cout << "Specify array length.\n";
return 1;
}
int *a, n;

n = stoi(argv[1]);
a = new int[n];

for (int i = 0; i < n; i++)
{
a[i] = rand() % n;
}

cout << "Generated random array of length " << n << "\n\n";
omp_set_num_threads(16);
int sequentialMin = bench_traverse([&] { s_min(a, n); });
int parallelMin = bench_traverse([&] { p_min(a, n); });

int sequentialMax = bench_traverse([&] { s_max(a, n); });
int parallelMax = bench_traverse([&] { p_max(a, n); });

int sequentialSum = bench_traverse([&] { s_sum(a, n); });
int parallelSum = bench_traverse([&] { p_sum(a, n); });

int sequentialAverage = bench_traverse([&] { s_avg(a, n); });
int parallelAverage = bench_traverse([&] { p_avg(a, n); });

cout << "Sequential Min: " << sequentialMin << "ms\n";


cout << "Parallel (16) Min: " << parallelMin << "ms\n";
cout << "Speed Up for Min: " << (float)sequentialMin / parallelMin << "\n";
cout << "Efficiency for Min: " << ((float)sequentialMin / parallelMin) / 16 << "\n";

cout << "\nSequential Max: " << sequentialMax << "ms\n";


cout << "Parallel (16) Max: " << parallelMax << "ms\n";
cout << "Speed Up for Max: " << (float)sequentialMax / parallelMax << "\n";
cout << "Efficiency for Max: " << ((float)sequentialMax / parallelMax) / 16 << "\n";

cout << "\nSequential Sum: " << sequentialSum << "ms\n";


cout << "Parallel (16) Sum: " << parallelSum << "ms\n";
cout << "Speed Up for Sum: " << (float)sequentialSum / parallelSum << "\n";
cout << "Efficiency for Sum: " << ((float)sequentialSum / parallelSum) / 16 << "\n";

cout << "\nSequential Average: " << sequentialAverage << "ms\n";


cout << "Parallel (16) Average: " << parallelAverage << "ms\n";
cout << "Speed Up for Average: " << (float)sequentialAverage / parallelAverage << "\n";
cout << "Efficiency for Average: " << ((float)sequentialAverage / parallelAverage) / 16 << "\n";

return 0;
}
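
For reference, the reduction clause used above conceptually gives each thread a private accumulator and combines the partial results at the end. A minimal hand-written equivalent for the sum case (manual_sum is a name introduced here for illustration; the reduction clause remains the idiomatic form):

#include <omp.h>

long manual_sum(int arr[], int n)
{
    long sum = 0L;
    #pragma omp parallel
    {
        long local = 0L; // private per-thread accumulator
        #pragma omp for nowait
        for (int i = 0; i < n; i++)
            local += arr[i];
        #pragma omp atomic
        sum += local; // combine the partial results
    }
    return sum;
}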
Output
Addition of Two Large Vectors using CUDA C
Code
#include <stdio.h>

__global__ void vectorAdd(float *a, float *b, float *c, int n)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
{
c[i] = a[i] + b[i];
}
}

int main()
{
int n = 1000000;
size_t bytes = n * sizeof(float);

// Allocate memory on the host
float *h_a = (float *)malloc(bytes);
float *h_b = (float *)malloc(bytes);
float *h_c = (float *)malloc(bytes);

// Initialize the vectors
for (int i = 0; i < n; i++)
{
h_a[i] = i;
h_b[i] = i + 1;
}

// Allocate memory on the device
float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);

// Copy data from host to device
cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

// Launch kernel on the device
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

// Copy result from device to host (cudaMemcpy on the default stream
// waits for the kernel to finish before copying)
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

// Print first 10 elements of both vectors
printf("First 10 elements of vector a:\n");
for (int i = 0; i < 10; i++)
{
printf("%.2f ", h_a[i]);
}
printf("\n");
printf("Size of vector a: %d\n", n);
printf("\n");

printf("First 10 elements of vector b:\n");


for (int i = 0; i < 10; i++)
{
printf("%.2f ", h_b[i]);
}
printf("\n");
printf("Size of vector b: %d\n", n);
printf("\n");

// Print first 10 elements of resultant vector
printf("First 10 elements of resultant vector:\n");
for (int i = 0; i < 10; i++)
{
printf("%.2f ", h_c[i]);
}
printf("\n");

// Print size of resultant vector
printf("Size of resultant vector: %d\n", n);

// Free memory
free(h_a);
free(h_b);
free(h_c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

return 0;
}
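
The program above does not check CUDA API return codes, so a failed allocation or kernel launch would go unnoticed. A common pattern is a small checking macro; a minimal sketch (CUDA_CHECK is a name introduced here for illustration, not part of the practical):

#include <stdio.h>
#include <stdlib.h>

// Wrap any CUDA runtime call; abort with file/line and error text on failure
#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t err = (call);                                 \
        if (err != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",          \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1);                                              \
        }                                                         \
    } while (0)

// Usage, e.g.: CUDA_CHECK(cudaMalloc(&d_a, bytes));
// After a kernel launch: CUDA_CHECK(cudaGetLastError());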
Output
Matrix Multiplication using CUDA C
Code
#include <stdio.h>

#define TILE_WIDTH 32

__global__ void matrixMul(float *a, float *b, float *c, int m, int n, int p)
{
__shared__ float As[TILE_WIDTH][TILE_WIDTH];
__shared__ float Bs[TILE_WIDTH][TILE_WIDTH];

int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;

int row = by * TILE_WIDTH + ty;
int col = bx * TILE_WIDTH + tx;

float Cvalue = 0.0;

// Loop over tiles; assumes m, n and p are multiples of TILE_WIDTH
// (true here, since all are 1024)
for (int k = 0; k < n / TILE_WIDTH; k++)
{
As[ty][tx] = a[row * n + k * TILE_WIDTH + tx];
Bs[ty][tx] = b[(k * TILE_WIDTH + ty) * p + col];

__syncthreads();

for (int i = 0; i < TILE_WIDTH; i++)
{
Cvalue += As[ty][i] * Bs[i][tx];
}

__syncthreads();
}

c[row * p + col] = Cvalue;
}

int main()
{
int m = 1024;
int n = 1024;
int p = 1024;
size_t bytesA = m * n * sizeof(float);
size_t bytesB = n * p * sizeof(float);
size_t bytesC = m * p * sizeof(float);

// Allocate memory on the host
float *h_a = (float *)malloc(bytesA);
float *h_b = (float *)malloc(bytesB);
float *h_c = (float *)malloc(bytesC);
// Initialize matrices
for (int i = 0; i < m * n; i++)
{
h_a[i] = 1.0;
}
for (int i = 0; i < n * p; i++)
{
h_b[i] = 2.0;
}

// Allocate memory on the device
float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytesA);
cudaMalloc(&d_b, bytesB);
cudaMalloc(&d_c, bytesC);

// Copy data from host to device
cudaMemcpy(d_a, h_a, bytesA, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, bytesB, cudaMemcpyHostToDevice);

// Launch kernel on the device
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
dim3 dimGrid((p + dimBlock.x - 1) / dimBlock.x, (m + dimBlock.y - 1) / dimBlock.y);
matrixMul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, p);

// Copy result from device to host
cudaMemcpy(h_c, d_c, bytesC, cudaMemcpyDeviceToHost);

// Print 3x3 parts of both matrices
printf("Matrix A (3x3 part):\n");
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
printf("%.2f ", h_a[i * n + j]);
}
printf("\n");
}
printf("Size of Matrix A: %dx%d\n", m, n);
printf("\n");

printf("Matrix B (3x3 part):\n");


for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
printf("%.2f ", h_b[i * p + j]);
}
printf("\n");
}
printf("Size of Matrix B: %dx%d\n", n, p);
printf("\n");

// Print 3x3 part of resultant matrix
printf("Resultant Matrix (3x3 part):\n");
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
printf("%.2f ", h_c[i * p + j]);
}
printf("\n");
}

// Print size of resultant matrix
printf("Size of Resultant Matrix: %dx%d\n", m, p);

// Free memory on the host and device
free(h_a);
free(h_b);
free(h_c);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

return 0;
}
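
The kernel above relies on m, n and p being exact multiples of TILE_WIDTH, which holds for 1024 and 32. For arbitrary sizes, the tile loads and the final store need bounds checks; a minimal sketch of a guarded variant (matrixMulGuarded is a name introduced here for illustration; TILE_WIDTH is as defined above):

__global__ void matrixMulGuarded(float *a, float *b, float *c, int m, int n, int p)
{
    __shared__ float As[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];

    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    float Cvalue = 0.0f;

    // Round the tile count up so partial tiles at the edges are covered
    for (int k = 0; k < (n + TILE_WIDTH - 1) / TILE_WIDTH; k++)
    {
        int aCol = k * TILE_WIDTH + threadIdx.x;
        int bRow = k * TILE_WIDTH + threadIdx.y;
        // Load zeros for out-of-range elements so the inner product is unaffected
        As[threadIdx.y][threadIdx.x] = (row < m && aCol < n) ? a[row * n + aCol] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] = (bRow < n && col < p) ? b[bRow * p + col] : 0.0f;
        __syncthreads();

        for (int i = 0; i < TILE_WIDTH; i++)
            Cvalue += As[threadIdx.y][i] * Bs[i][threadIdx.x];
        __syncthreads();
    }

    // Only threads that map to a real output element write the result
    if (row < m && col < p)
        c[row * p + col] = Cvalue;
}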
Output
