0% found this document useful (0 votes)
5 views14 pages

HPC Codes

The document contains multiple code examples demonstrating parallel algorithms in Python and CUDA for graph traversal (DFS and BFS), sorting (Bubble Sort and Merge Sort), and matrix/vector operations. Each example includes a sequential and parallel implementation, along with timing comparisons for performance evaluation. The outputs of the algorithms are also provided, showcasing the results of the operations performed.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views14 pages

HPC Codes

The document contains multiple code examples demonstrating parallel algorithms in Python and CUDA for graph traversal (DFS and BFS), sorting (Bubble Sort and Merge Sort), and matrix/vector operations. Each example includes a sequential and parallel implementation, along with timing comparisons for performance evaluation. The outputs of the algorithms are also provided, showcasing the results of the operations performed.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 14

EXP1

Code:

import threading

from queue import Queue

class Graph:
    """Directed graph on vertices ``0 .. V-1`` stored as adjacency lists.

    Both traversals print every visited vertex, followed by a single space,
    to standard output.
    """

    def __init__(self, V):
        """Create a graph with *V* vertices and no edges."""
        self.V = V
        self.adj = [[] for _ in range(V)]
        # Guards the per-traversal ``visited`` list and keeps the output of
        # concurrent worker threads from interleaving.
        self.lock = threading.Lock()

    def add_edge(self, v, w):
        """Add the directed edge ``v -> w``."""
        self.adj[v].append(w)

    def parallel_dfs(self, start_vertex):
        """Print every vertex reachable from *start_vertex* exactly once.

        A new thread is started for each not-yet-visited neighbour, so
        sibling branches are explored concurrently. The printed order
        therefore starts with *start_vertex* but is otherwise
        scheduling-dependent and need not be a strict depth-first order.
        """
        visited = [False] * self.V

        def dfs_util(v):
            # Claim the vertex atomically; a thread that loses the race
            # simply returns.
            with self.lock:
                if visited[v]:
                    return
                visited[v] = True
                print(v, end=" ")

            workers = []
            for n in self.adj[v]:
                # Unlocked read is only a cheap filter that avoids spawning
                # useless threads; the authoritative check is the locked one
                # at the top of dfs_util.
                if not visited[n]:
                    worker = threading.Thread(target=dfs_util, args=(n,))
                    workers.append(worker)
                    worker.start()

            # A branch is finished only when all of its sub-branches are.
            for worker in workers:
                worker.join()

        root = threading.Thread(target=dfs_util, args=(start_vertex,))
        root.start()
        root.join()

    def parallel_bfs(self, start_vertex):
        """Print every vertex reachable from *start_vertex*, level by level.

        All vertices of one level are expanded concurrently, one thread per
        vertex, and the next level starts only after every thread of the
        current level has finished. Vertices of a nearer level are always
        printed before those of a farther level; the order inside a level
        is scheduling-dependent.
        """
        visited = [False] * self.V
        q = Queue()
        q.put(start_vertex)
        visited[start_vertex] = True

        def process_node():
            # Exactly one vertex per thread: the queue holds at least as
            # many items as there are threads in this level, so this get()
            # can never block indefinitely.
            v = q.get()
            with self.lock:
                print(v, end=" ")
            for n in self.adj[v]:
                with self.lock:
                    if not visited[n]:
                        visited[n] = True
                        q.put(n)

        while not q.empty():
            # Only this thread runs between levels, so qsize() is exact here
            # and equals the number of vertices in the current level.
            level_size = q.qsize()
            threads = [
                threading.Thread(target=process_node)
                for _ in range(level_size)
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join()

# Create the sample graph: a complete binary tree rooted at vertex 0.
#
#            0
#          /   \
#         v     v
#         1     2
#        / \   / \
#       v   v v   v
#       3   4 5   6
g = Graph(7)
for src, dst in [(0, 1), (0, 2), (1, 3), (1, 4), (2, 5), (2, 6)]:
    g.add_edge(src, dst)

print("Depth-First Search (DFS): ", end="")
g.parallel_dfs(0)

print("\nBreadth-First Search (BFS): ", end="")
g.parallel_bfs(0)

print()

OUTPUT:

Depth-First Search (DFS): 0 1 3 2 4 5 6

Breadth-First Search (BFS): 0 1 2 3 4 5 6


EXP 2

CODE:

import time

import multiprocessing

# Sequential Bubble Sort

def bubble_sort(arr):
    """Sort the mutable sequence *arr* in place in ascending order.

    Classic bubble sort: after pass ``i`` the ``i + 1`` largest elements
    are in their final positions at the end of the sequence. Returns
    ``None``.
    """
    n = len(arr)
    for i in range(n - 1):
        swapped = False
        for j in range(n - i - 1):
            if arr[j] > arr[j + 1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
                swapped = True
        # A pass without swaps means the sequence is already sorted.
        if not swapped:
            break

# Parallel Odd-Even Bubble Sort

def _ordered_pair(pair):
    """Return the two elements of *pair* as a tuple in ascending order."""
    left, right = pair
    return (right, left) if right < left else (left, right)


def parallel_bubble_sort(arr):
    """Sort the list *arr* in place with odd-even transposition sort.

    The sort runs ``len(arr)`` phases that alternate between the pairs
    ``(0, 1), (2, 3), ...`` and the pairs ``(1, 2), (3, 4), ...``. The
    pairs of one phase are disjoint, so their compare-exchange steps are
    farmed out to a pool of worker processes. The workers return the
    ordered pairs and this process writes them back into *arr*; a child
    process cannot modify the caller's list directly because it only ever
    sees a copy.

    Elements must be picklable. Returns ``None``.
    """
    n = len(arr)
    if n < 2:
        return

    # A phase never has more than n // 2 independent pairs.
    workers = max(1, min(multiprocessing.cpu_count(), n // 2))
    with multiprocessing.Pool(processes=workers) as pool:
        for phase in range(n):
            first = 1 if phase % 2 == 0 else 2
            indices = range(first, n, 2)
            if not indices:
                continue
            pairs = [(arr[j - 1], arr[j]) for j in indices]
            ordered = pool.map(_ordered_pair, pairs)
            for j, (low, high) in zip(indices, ordered):
                arr[j - 1], arr[j] = low, high

# Print array

def print_array(arr):
    """Print the elements of *arr* on one line, separated by single spaces."""
    print(" ".join(map(str, arr)))

if __name__ == "__main__":
    n = 10
    arr = list(range(n, 0, -1))  # Reverse-sorted input: n, n-1, ..., 1
    arr_parallel = arr.copy()  # Identical input for the parallel version

    # Sequential sorting time. perf_counter() is the high-resolution clock
    # intended for measuring short intervals.
    start_time = time.perf_counter()
    bubble_sort(arr)
    end_time = time.perf_counter()
    print("Sequential Bubble Sort took:", round(end_time - start_time, 6), "seconds")
    print_array(arr)

    # Parallel sorting time
    start_time = time.perf_counter()
    parallel_bubble_sort(arr_parallel)
    end_time = time.perf_counter()
    print("Parallel Bubble Sort took:", round(end_time - start_time, 6), "seconds")
    print_array(arr_parallel)

OUTPUT:

Sequential Bubble Sort took: 1e-05 seconds

1 2 3 4 5 6 7 8 9 10

Parallel Bubble Sort took: 0.108825 seconds

10 9 8 7 6 5 4 3 2 1
EXP 3

CODE:

import time

import multiprocessing

# Merge function

def merge(arr, low, mid, high):
    """Merge two adjacent sorted runs of *arr* in place.

    ``arr[low..mid]`` and ``arr[mid+1..high]`` (both bounds inclusive) must
    each already be sorted in ascending order. Afterwards ``arr[low..high]``
    is sorted. Ties are taken from the left run first. Returns ``None``.
    """
    left = arr[low:mid + 1]
    right = arr[mid + 1:high + 1]

    i = j = 0
    k = low
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            arr[k] = left[i]
            i += 1
        else:
            arr[k] = right[j]
            j += 1
        k += 1

    # At most one run still has elements; copy them over unchanged.
    tail = left[i:] if i < len(left) else right[j:]
    arr[k:k + len(tail)] = tail

# Sequential Merge Sort

def merge_sort(arr, low, high):
    """Sort ``arr[low..high]`` (inclusive bounds) in place by merge sort.

    A range with fewer than two elements is left untouched. Returns
    ``None``.
    """
    if low < high:
        mid = (low + high) // 2
        merge_sort(arr, low, mid)
        merge_sort(arr, mid + 1, high)
        merge(arr, low, mid, high)

# Parallel Merge Sort

def _merge_sort_chunk(chunk):
    """Worker: merge-sort the list *chunk* and return it."""
    merge_sort(chunk, 0, len(chunk) - 1)
    return chunk


def parallel_merge_sort(arr, low, high):
    """Sort ``arr[low..high]`` (inclusive bounds) in place using processes.

    The range is cut into contiguous chunks that worker processes sort
    with the sequential merge sort. A worker only ever sees a copy of its
    chunk, so the sorted chunks are sent back and written into *arr* by
    this process, which then merges neighbouring runs pairwise until a
    single sorted run remains.

    Elements must be picklable. Returns ``None``.
    """
    if low >= high:
        return

    size = high - low + 1
    workers = max(1, min(multiprocessing.cpu_count(), size // 2))
    step = -(-size // workers)  # ceiling division
    bounds = [
        (start, min(start + step - 1, high))
        for start in range(low, high + 1, step)
    ]

    if len(bounds) == 1:
        # Not enough work to justify starting a process pool.
        merge_sort(arr, low, high)
        return

    chunks = [arr[start:end + 1] for start, end in bounds]
    with multiprocessing.Pool(processes=len(bounds)) as pool:
        sorted_chunks = pool.map(_merge_sort_chunk, chunks)
    for (start, end), chunk in zip(bounds, sorted_chunks):
        arr[start:end + 1] = chunk

    # Merge adjacent sorted runs two at a time until one run is left.
    while len(bounds) > 1:
        merged = []
        for idx in range(0, len(bounds) - 1, 2):
            start, mid = bounds[idx]
            end = bounds[idx + 1][1]
            merge(arr, start, mid, end)
            merged.append((start, end))
        if len(bounds) % 2:
            merged.append(bounds[-1])  # odd run out waits for the next round
        bounds = merged

# Main function

if __name__ == "__main__":
    n = 10
    arr = list(range(n, 0, -1))  # Reverse-sorted input: n, n-1, ..., 1
    arr_parallel = arr.copy()  # Identical input for the parallel version

    # Sequential sorting time. perf_counter() is the high-resolution clock
    # intended for measuring short intervals.
    start_time = time.perf_counter()
    merge_sort(arr, 0, n - 1)
    end_time = time.perf_counter()
    print("Time taken by sequential algorithm:", round(end_time - start_time, 6), "seconds")

    # Parallel sorting time
    start_time = time.perf_counter()
    parallel_merge_sort(arr_parallel, 0, n - 1)
    end_time = time.perf_counter()
    print("Time taken by parallel algorithm:", round(end_time - start_time, 6), "seconds")

OUTPUT:

Time taken by sequential algorithm: 2.1e-05 seconds

Time taken by parallel algorithm: 0.126153 seconds


EXP 4

CODE:

#include <iostream>

#include <cuda_runtime.h>

using namespace std;

// CUDA Kernel for 3x3 Matrix Multiplication

// Computes C = A * B for 3x3 integer matrices stored as flat arrays of 9
// elements, indexed as [row * 3 + col].
// Each thread computes one element of C, selected by (threadIdx.y, threadIdx.x).
// Only thread indices are used, so the kernel is meant to be launched with a
// single block of at least 3x3 threads; threads outside that range do nothing.
__global__ void matrixMultiply(int *A, int *B, int *C) {
    int row = threadIdx.y;
    int col = threadIdx.x;

    if (row < 3 && col < 3) {
        int sum = 0;
        // Dot product of row `row` of A with column `col` of B.
        for (int k = 0; k < 3; k++) {
            sum += A[row * 3 + k] * B[k * 3 + col];
        }
        C[row * 3 + col] = sum;
    }
}

// CUDA Kernel for Vector Addition (Size 3)

// Computes V3[i] = V1[i] + V2[i] for vectors of 3 integers.
// Each thread handles the element selected by threadIdx.x; threads with an
// index of 3 or more do nothing. Only the thread index is used, so the kernel
// is meant to be launched with a single block.
__global__ void vectorAdd(int *V1, int *V2, int *V3) {
    int i = threadIdx.x;

    if (i < 3) {
        V3[i] = V1[i] + V2[i];
    }
}
// Function to print a 3x3 matrix

// Prints the 9 integers at M as a 3x3 matrix: one row per line, each element
// followed by a space. M is read as a flat array indexed by [i * 3 + j].
void printMatrix(int *M) {
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            std::cout << M[i * 3 + j] << " ";
        }
        std::cout << std::endl;
    }
}

// Function to print a vector of size 3

// Prints the 3 integers at V on one line, each followed by a space, and then
// ends the line.
void printVector(int *V) {
    for (int i = 0; i < 3; i++) {
        std::cout << V[i] << " ";
    }
    std::cout << std::endl;
}

// Multiplies two fixed 3x3 matrices and adds two fixed 3-element vectors on
// the GPU, then prints both results. Returns 0 on success and 1 if any CUDA
// runtime call or kernel launch reports an error.
int main() {
    // Host matrices and vectors
    int h_A[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    int h_B[3][3] = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
    int h_C[3][3];

    int h_V1[3] = {1, 2, 3};
    int h_V2[3] = {4, 5, 6};
    int h_V3[3];

    // Device memory pointers; null-initialised so the cudaFree calls at the
    // end are safe even if an allocation never happened.
    int *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    int *d_V1 = nullptr, *d_V2 = nullptr, *d_V3 = nullptr;

    // Reports a failed CUDA call on stderr; returns true when it succeeded.
    auto ok = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status)
                      << std::endl;
            return false;
        }
        return true;
    };

    // Allocate memory on GPU and copy the inputs from Host to Device.
    // The && chain stops at the first failing call.
    bool success =
        ok(cudaMalloc((void**)&d_A, 9 * sizeof(int)), "cudaMalloc(d_A)") &&
        ok(cudaMalloc((void**)&d_B, 9 * sizeof(int)), "cudaMalloc(d_B)") &&
        ok(cudaMalloc((void**)&d_C, 9 * sizeof(int)), "cudaMalloc(d_C)") &&
        ok(cudaMalloc((void**)&d_V1, 3 * sizeof(int)), "cudaMalloc(d_V1)") &&
        ok(cudaMalloc((void**)&d_V2, 3 * sizeof(int)), "cudaMalloc(d_V2)") &&
        ok(cudaMalloc((void**)&d_V3, 3 * sizeof(int)), "cudaMalloc(d_V3)") &&
        ok(cudaMemcpy(d_A, h_A, 9 * sizeof(int), cudaMemcpyHostToDevice),
           "cudaMemcpy(d_A)") &&
        ok(cudaMemcpy(d_B, h_B, 9 * sizeof(int), cudaMemcpyHostToDevice),
           "cudaMemcpy(d_B)") &&
        ok(cudaMemcpy(d_V1, h_V1, 3 * sizeof(int), cudaMemcpyHostToDevice),
           "cudaMemcpy(d_V1)") &&
        ok(cudaMemcpy(d_V2, h_V2, 3 * sizeof(int), cudaMemcpyHostToDevice),
           "cudaMemcpy(d_V2)");

    if (success) {
        // Define CUDA execution configuration
        dim3 threadsPerBlock(3, 3);  // 3x3 threads for matrix multiplication
        dim3 threadsPerVector(3);    // 3 threads for vector addition

        // Launch the kernels; a bad launch is only visible via
        // cudaGetLastError().
        matrixMultiply<<<1, threadsPerBlock>>>(d_A, d_B, d_C);
        success = ok(cudaGetLastError(), "matrixMultiply launch");

        if (success) {
            vectorAdd<<<1, threadsPerVector>>>(d_V1, d_V2, d_V3);
            success = ok(cudaGetLastError(), "vectorAdd launch");
        }
    }

    if (success) {
        // Copy result back from Device to Host
        success =
            ok(cudaMemcpy(h_C, d_C, 9 * sizeof(int), cudaMemcpyDeviceToHost),
               "cudaMemcpy(h_C)") &&
            ok(cudaMemcpy(h_V3, d_V3, 3 * sizeof(int), cudaMemcpyDeviceToHost),
               "cudaMemcpy(h_V3)");
    }

    if (success) {
        // Print the results
        std::cout << "Matrix Multiplication Result:\n";
        printMatrix((int*)h_C);

        std::cout << "\nVector Addition Result:\n";
        printVector((int*)h_V3);
    }

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_V1);
    cudaFree(d_V2);
    cudaFree(d_V3);

    return success ? 0 : 1;
}

OUTPUT:

Matrix Multiplication Result:

30 24 18

84 69 54

138 114 90

Vector Addition Result:

5 7 9

You might also like