HPC_codes

The document contains code examples demonstrating parallel algorithms in Python and CUDA: graph traversal (DFS and BFS), sorting (bubble sort and merge sort), and matrix/vector operations. The sorting experiments pair a sequential implementation with a parallel one and compare their running times, and the output of each program is shown after its code.

EXP 1

CODE:

import threading
from queue import Queue, Empty

class Graph:
    def __init__(self, V):
        self.V = V
        self.adj = [[] for _ in range(V)]
        self.lock = threading.Lock()  # Lock for thread-safe access to visited

    def add_edge(self, v, w):
        self.adj[v].append(w)

    # Parallel Depth-First Search
    def parallel_dfs(self, start_vertex):
        visited = [False] * self.V

        def dfs_util(v):
            with self.lock:
                if visited[v]:
                    return
                visited[v] = True
                print(v, end=" ")
            # Start a thread for each unvisited adjacent node
            local_threads = []
            for n in self.adj[v]:
                if not visited[n]:
                    t = threading.Thread(target=dfs_util, args=(n,))
                    local_threads.append(t)
                    t.start()
            # Wait for all child threads to finish
            for t in local_threads:
                t.join()

        # Start DFS from the initial node
        t = threading.Thread(target=dfs_util, args=(start_vertex,))
        t.start()
        t.join()

    # Parallel Breadth-First Search
    def parallel_bfs(self, start_vertex):
        visited = [False] * self.V
        q = Queue()
        q.put(start_vertex)
        visited[start_vertex] = True

        while not q.empty():
            q_size = q.qsize()
            threads = []

            def process_node():
                while True:
                    # Non-blocking get: q.get() after an empty() check can
                    # deadlock if another thread drains the queue first
                    try:
                        v = q.get_nowait()
                    except Empty:
                        break
                    print(v, end=" ")
                    # Add unvisited adjacent nodes to the queue
                    for n in self.adj[v]:
                        with self.lock:
                            if not visited[n]:
                                visited[n] = True
                                q.put(n)

            # Launch one thread per node currently in the queue
            for _ in range(q_size):
                t = threading.Thread(target=process_node)
                threads.append(t)
                t.start()
            for t in threads:
                t.join()

# Create a graph
g = Graph(7)
g.add_edge(0, 1)
g.add_edge(0, 2)
g.add_edge(1, 3)
g.add_edge(1, 4)
g.add_edge(2, 5)
g.add_edge(2, 6)

"""
        0
       / \
      v   v
      1   2
     / \ / \
    v  v v  v
    3  4 5  6
"""

print("Depth-First Search (DFS): ", end="")
g.parallel_dfs(0)
print("\nBreadth-First Search (BFS): ", end="")
g.parallel_bfs(0)
print()

OUTPUT:

Depth-First Search (DFS): 0 1 3 2 4 5 6
Breadth-First Search (BFS): 0 1 2 3 4 5 6
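
EXP 1 only includes the threaded traversals. As a baseline for comparison, a plain sequential DFS and BFS over the same Graph class might look like the sketch below (the names sequential_dfs and sequential_bfs are illustrative, not part of the original code):

from collections import deque

def sequential_dfs(g, v, visited=None):
    # Plain recursive DFS over the same adjacency list
    if visited is None:
        visited = [False] * g.V
    visited[v] = True
    print(v, end=" ")
    for n in g.adj[v]:
        if not visited[n]:
            sequential_dfs(g, n, visited)

def sequential_bfs(g, start):
    # Plain queue-based BFS
    visited = [False] * g.V
    q = deque([start])
    visited[start] = True
    while q:
        v = q.popleft()
        print(v, end=" ")
        for n in g.adj[v]:
            if not visited[n]:
                visited[n] = True
                q.append(n)

Because of CPython's global interpreter lock, the threaded versions do not actually execute this CPU-bound work in parallel, so the sequential versions are typically faster for graphs of this size.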


EXP 2

CODE:

import time
import multiprocessing

# Sequential Bubble Sort
def bubble_sort(arr):
    n = len(arr)
    for i in range(n - 1):
        for j in range(n - i - 1):
            if arr[j] > arr[j + 1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]

# Parallel Odd-Even Bubble Sort
def parallel_bubble_sort(arr):
    n = len(arr)

    def odd_even_pass(is_odd):
        """Compare-exchange the odd- or even-indexed pairs (one pass)."""
        for j in range(1 if is_odd else 2, n, 2):
            if arr[j] < arr[j - 1]:
                arr[j], arr[j - 1] = arr[j - 1], arr[j]

    # NOTE: each Process receives a copy of arr, so swaps made in the
    # children are not visible in the parent; arr_parallel is therefore
    # printed unchanged below.
    for _ in range(n):  # Run multiple passes
        # Odd index pass
        p1 = multiprocessing.Process(target=odd_even_pass, args=(True,))
        p1.start()
        p1.join()
        # Even index pass
        p2 = multiprocessing.Process(target=odd_even_pass, args=(False,))
        p2.start()
        p2.join()

# Print array
def print_array(arr):
    print(" ".join(map(str, arr)))

if __name__ == "__main__":
    n = 10
    arr = list(range(n, 0, -1))  # Create an array from 10 down to 1
    arr_parallel = arr.copy()  # Copy for parallel sorting

    # Sequential sorting time
    start_time = time.time()
    bubble_sort(arr)
    end_time = time.time()
    print("Sequential Bubble Sort took:", round(end_time - start_time, 6), "seconds")
    print_array(arr)

    # Parallel sorting time
    start_time = time.time()
    parallel_bubble_sort(arr_parallel)
    end_time = time.time()
    print("Parallel Bubble Sort took:", round(end_time - start_time, 6), "seconds")
    print_array(arr_parallel)

OUTPUT:

Sequential Bubble Sort took: 1e-05 seconds
1 2 3 4 5 6 7 8 9 10
Parallel Bubble Sort took: 0.108825 seconds
10 9 8 7 6 5 4 3 2 1
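
The parallel output above is unsorted because each multiprocessing.Process operates on its own copy of the array. A minimal sketch of a working variant, assuming a shared-memory multiprocessing.Array in place of a plain list (this code is not part of the original experiment):

import multiprocessing

def odd_even_pass(shared, n, is_odd):
    # Compare-exchange the odd- or even-indexed pairs in shared memory
    for j in range(1 if is_odd else 2, n, 2):
        if shared[j] < shared[j - 1]:
            shared[j], shared[j - 1] = shared[j - 1], shared[j]

if __name__ == "__main__":
    n = 10
    shared = multiprocessing.Array('i', range(n, 0, -1))  # visible to all processes
    for _ in range(n):
        for is_odd in (True, False):
            p = multiprocessing.Process(target=odd_even_pass, args=(shared, n, is_odd))
            p.start()
            p.join()
    print(" ".join(map(str, shared[:])))  # 1 2 3 4 5 6 7 8 9 10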
EXP 3

CODE:

import time
import multiprocessing

# Merge function
def merge(arr, low, mid, high):
    left = arr[low:mid + 1]
    right = arr[mid + 1:high + 1]
    i = j = 0
    k = low
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            arr[k] = left[i]
            i += 1
        else:
            arr[k] = right[j]
            j += 1
        k += 1
    while i < len(left):
        arr[k] = left[i]
        i += 1
        k += 1
    while j < len(right):
        arr[k] = right[j]
        j += 1
        k += 1

# Sequential Merge Sort
def merge_sort(arr, low, high):
    if low < high:
        mid = (low + high) // 2
        merge_sort(arr, low, mid)
        merge_sort(arr, mid + 1, high)
        merge(arr, low, mid, high)

# Parallel Merge Sort
# NOTE: each Process recursively sorts a copy of arr, so the children's
# work is not visible in the parent; only the timings are comparable.
def parallel_merge_sort(arr, low, high):
    if low < high:
        mid = (low + high) // 2
        left_process = multiprocessing.Process(target=parallel_merge_sort, args=(arr, low, mid))
        right_process = multiprocessing.Process(target=parallel_merge_sort, args=(arr, mid + 1, high))
        left_process.start()
        right_process.start()
        left_process.join()
        right_process.join()
        merge(arr, low, mid, high)

# Main function
if __name__ == "__main__":
    n = 10
    arr = list(range(n, 0, -1))  # Create an array from 10 down to 1
    arr_parallel = arr.copy()  # Copy for parallel sorting

    # Sequential sorting time
    start_time = time.time()
    merge_sort(arr, 0, n - 1)
    end_time = time.time()
    print("Time taken by sequential algorithm:", round(end_time - start_time, 6), "seconds")

    # Parallel sorting time
    start_time = time.time()
    parallel_merge_sort(arr_parallel, 0, n - 1)
    end_time = time.time()
    print("Time taken by parallel algorithm:", round(end_time - start_time, 6), "seconds")

OUTPUT:

Time taken by sequential algorithm: 2.1e-05 seconds
Time taken by parallel algorithm: 0.126153 seconds
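
As in EXP 2, the spawned processes sort copies of the array, so arr_parallel is not actually sorted when the parent merges; the timings above mainly measure process-creation overhead. A minimal sketch of a two-worker variant that does produce a sorted result, assuming a multiprocessing.Pool and an illustrative merge_lists helper (neither is part of the original code):

import multiprocessing

def merge_lists(left, right):
    # Standard two-way merge of two sorted lists
    out, i, j = [], 0, 0
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            out.append(left[i])
            i += 1
        else:
            out.append(right[j])
            j += 1
    return out + left[i:] + right[j:]

if __name__ == "__main__":
    arr = list(range(10, 0, -1))
    mid = len(arr) // 2
    with multiprocessing.Pool(2) as pool:
        # Each worker sorts one half and returns the sorted copy
        left, right = pool.map(sorted, [arr[:mid], arr[mid:]])
    print(merge_lists(left, right))  # [1, 2, ..., 10]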


EXP 4

CODE:

#include <iostream>
#include <cuda_runtime.h>

using namespace std;

// CUDA Kernel for 3x3 Matrix Multiplication
__global__ void matrixMultiply(int *A, int *B, int *C) {
    int row = threadIdx.y;
    int col = threadIdx.x;
    if (row < 3 && col < 3) {
        int sum = 0;
        for (int k = 0; k < 3; k++) {
            sum += A[row * 3 + k] * B[k * 3 + col];
        }
        C[row * 3 + col] = sum;
    }
}

// CUDA Kernel for Vector Addition (Size 3)
__global__ void vectorAdd(int *V1, int *V2, int *V3) {
    int i = threadIdx.x;
    if (i < 3) {
        V3[i] = V1[i] + V2[i];
    }
}

// Function to print a 3x3 matrix
void printMatrix(int *M) {
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            cout << M[i * 3 + j] << " ";
        }
        cout << endl;
    }
}

// Function to print a vector of size 3
void printVector(int *V) {
    for (int i = 0; i < 3; i++) {
        cout << V[i] << " ";
    }
    cout << endl;
}

int main() {
    // Host matrices and vectors
    int h_A[3][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    int h_B[3][3] = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
    int h_C[3][3];
    int h_V1[3] = {1, 2, 3};
    int h_V2[3] = {4, 5, 6};
    int h_V3[3];

    // Device memory pointers
    int *d_A, *d_B, *d_C;
    int *d_V1, *d_V2, *d_V3;

    // Allocate memory on GPU
    cudaMalloc((void**)&d_A, 9 * sizeof(int));
    cudaMalloc((void**)&d_B, 9 * sizeof(int));
    cudaMalloc((void**)&d_C, 9 * sizeof(int));
    cudaMalloc((void**)&d_V1, 3 * sizeof(int));
    cudaMalloc((void**)&d_V2, 3 * sizeof(int));
    cudaMalloc((void**)&d_V3, 3 * sizeof(int));

    // Copy data from Host to Device
    cudaMemcpy(d_A, h_A, 9 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, 9 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_V1, h_V1, 3 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_V2, h_V2, 3 * sizeof(int), cudaMemcpyHostToDevice);

    // Define CUDA execution configuration
    dim3 threadsPerBlock(3, 3);  // 3x3 threads for matrix multiplication
    dim3 threadsPerVector(3);    // 3 threads for vector addition

    // Launch the kernels
    matrixMultiply<<<1, threadsPerBlock>>>(d_A, d_B, d_C);
    vectorAdd<<<1, threadsPerVector>>>(d_V1, d_V2, d_V3);

    // Copy results back from Device to Host
    cudaMemcpy(h_C, d_C, 9 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_V3, d_V3, 3 * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the results
    cout << "Matrix Multiplication Result:\n";
    printMatrix((int*)h_C);
    cout << "\nVector Addition Result:\n";
    printVector((int*)h_V3);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_V1);
    cudaFree(d_V2);
    cudaFree(d_V3);

    return 0;
}

OUTPUT:

Matrix Multiplication Result:
30 24 18
84 69 54
138 114 90

Vector Addition Result:
5 7 9
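
The CUDA program can be compiled and run with NVIDIA's nvcc toolchain; assuming the listing is saved as exp4.cu (the filename is illustrative):

nvcc exp4.cu -o exp4
./exp4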
