Assignment 04

The document contains code implementations for matrix multiplication and Pi calculation using various parallel programming models including MPI, pThreads, OpenMP, and CUDA. Each section provides a distinct approach for distributed and shared memory systems, showcasing how to efficiently perform computations across multiple processes or threads. The implementations include error handling and random matrix generation, with results displayed for the top 5x5 submatrices or calculated Pi values.


Name: Dilawar Ali

Roll No: 107278

Task_1:
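All four implementations below multiply two n x n matrices A and B filled with random values in [0, 1] and print the top-left 5x5 corner of the product C = A x B. The matrix size and the number of processes or threads are read from the command line.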
1. Using MPI (Message Passing Interface) for distributed memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>

/* Fill a flat array with random values in [0, 1]. */
void fill_random(double *matrix, int size) {
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

/* Print the top-left 5x5 corner of an n x n matrix. */
void print_submatrix(double *matrix, int n) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_processes>\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    int num_procs = atoi(argv[2]);

    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != num_procs) {
        if (rank == 0) printf("Error: Requested %d processes but got %d\n", num_procs, size);
        MPI_Finalize();
        return 1;
    }
    if (n % size != 0) {
        if (rank == 0) printf("Error: Matrix size must be divisible by number of processes\n");
        MPI_Finalize();
        return 1;
    }

    double *A = NULL, *B = NULL, *C = NULL;
    double *local_A = malloc(n * n / size * sizeof(double));
    double *local_C = malloc(n * n / size * sizeof(double));
    B = malloc(n * n * sizeof(double));   /* every process needs a full copy of B */

    if (rank == 0) {
        A = malloc(n * n * sizeof(double));
        C = malloc(n * n * sizeof(double));
        srand(time(NULL));
        fill_random(A, n*n);
        fill_random(B, n*n);
    }

    /* Distribute blocks of rows of A; broadcast all of B to every process. */
    MPI_Scatter(A, n*n/size, MPI_DOUBLE, local_A, n*n/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(B, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    /* Each process multiplies its block of rows by B. */
    int rows_per_proc = n / size;
    for (int i = 0; i < rows_per_proc; i++) {
        for (int j = 0; j < n; j++) {
            local_C[i*n + j] = 0;
            for (int k = 0; k < n; k++) {
                local_C[i*n + j] += local_A[i*n + k] * B[k*n + j];
            }
        }
    }

    /* Collect the row blocks of C back on rank 0. */
    MPI_Gather(local_C, n*n/size, MPI_DOUBLE, C, n*n/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Result (top 5x5):\n");
        print_submatrix(C, n);
        free(A); free(C);
    }
    free(local_A); free(local_C); free(B);

    MPI_Finalize();
    return 0;
}
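A typical way to build and run this version (assuming the file is saved as matmul_mpi.c and an MPI toolchain such as Open MPI or MPICH is installed; the matrix size 512 and process count 4 are only example values):

mpicc matmul_mpi.c -o matmul_mpi
mpirun -np 4 ./matmul_mpi 512 4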

2. Using pThreads (Posix Threads) for Shared Memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>

#define MAX_THREADS 1024

/* Shared state: matrix size, thread count, and the matrices themselves. */
int n, num_threads;
double *A, *B, *C;

typedef struct {
    int start_row;
    int end_row;
} ThreadData;

void fill_random(double *matrix, int size) {
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

void print_submatrix(double *matrix) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

/* Each thread multiplies its assigned block of rows of A by B. */
void *matmul_thread(void *arg) {
    ThreadData *data = (ThreadData *)arg;
    for (int i = data->start_row; i < data->end_row; i++) {
        for (int j = 0; j < n; j++) {
            C[i*n + j] = 0;
            for (int k = 0; k < n; k++) {
                C[i*n + j] += A[i*n + k] * B[k*n + j];
            }
        }
    }
    pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_threads>\n", argv[0]);
        return 1;
    }
    n = atoi(argv[1]);
    num_threads = atoi(argv[2]);

    A = malloc(n * n * sizeof(double));
    B = malloc(n * n * sizeof(double));
    C = malloc(n * n * sizeof(double));
    srand(time(NULL));
    fill_random(A, n*n);
    fill_random(B, n*n);

    pthread_t threads[MAX_THREADS];
    ThreadData thread_data[MAX_THREADS];
    int rows_per_thread = n / num_threads;   /* assumes n is divisible by num_threads */

    /* Give each thread a contiguous block of rows and start it. */
    for (int i = 0; i < num_threads; i++) {
        thread_data[i].start_row = i * rows_per_thread;
        thread_data[i].end_row = (i + 1) * rows_per_thread;
        pthread_create(&threads[i], NULL, matmul_thread, &thread_data[i]);
    }
    /* Wait for all worker threads to finish. */
    for (int i = 0; i < num_threads; i++) {
        pthread_join(threads[i], NULL);
    }

    printf("Result (top 5x5):\n");
    print_submatrix(C);
    free(A); free(B); free(C);
    return 0;
}
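A typical way to build and run this version (assuming the file is saved as matmul_pthreads.c; the matrix size and thread count are example values):

gcc matmul_pthreads.c -o matmul_pthreads -pthread
./matmul_pthreads 512 8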

3. Using OpenMP for Shared Memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

void fill_random(double *matrix, int size) {
    /* Note: rand() is not thread-safe, so this parallel fill is a simple
       approximation; rand_r() or per-thread generators would be safer. */
    #pragma omp parallel for
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

void print_submatrix(double *matrix, int n) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_threads>\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    int num_threads = atoi(argv[2]);
    omp_set_num_threads(num_threads);

    double *A = malloc(n * n * sizeof(double));
    double *B = malloc(n * n * sizeof(double));
    double *C = malloc(n * n * sizeof(double));
    srand(time(NULL));
    fill_random(A, n*n);
    fill_random(B, n*n);

    /* Parallelize over the (i, j) output elements. */
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            C[i*n + j] = 0;
            for (int k = 0; k < n; k++) {
                C[i*n + j] += A[i*n + k] * B[k*n + j];
            }
        }
    }

    printf("Result (top 5x5):\n");
    print_submatrix(C, n);
    free(A); free(B); free(C);
    return 0;
}
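A typical way to build and run this version (assuming the file is saved as matmul_omp.c and a compiler with OpenMP support such as gcc is used; the arguments are example values):

gcc matmul_omp.c -o matmul_omp -fopenmp
./matmul_omp 512 8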

4. Using CUDA for GPU based systems.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

/* Each GPU thread computes one element C[row][col]. */
__global__ void matmul_kernel(double *A, double *B, double *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        double sum = 0;
        for (int k = 0; k < n; k++) {
            sum += A[row*n + k] * B[k*n + col];
        }
        C[row*n + col] = sum;
    }
}

void fill_random(double *matrix, int size) {
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

void print_submatrix(double *matrix, int n) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_blocks>\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    int num_blocks = atoi(argv[2]);   /* accepted for the interface; the grid below is sized from BLOCK_SIZE */
    (void)num_blocks;

    double *A = (double*)malloc(n * n * sizeof(double));
    double *B = (double*)malloc(n * n * sizeof(double));
    double *C = (double*)malloc(n * n * sizeof(double));
    srand(time(NULL));
    fill_random(A, n*n);
    fill_random(B, n*n);

    /* Allocate device buffers and copy the inputs to the GPU. */
    double *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, n*n*sizeof(double));
    cudaMalloc(&d_B, n*n*sizeof(double));
    cudaMalloc(&d_C, n*n*sizeof(double));
    cudaMemcpy(d_A, A, n*n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, n*n*sizeof(double), cudaMemcpyHostToDevice);

    /* One thread per output element, 16x16 threads per block. */
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((n + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
    matmul_kernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, n);

    /* cudaMemcpy blocks until the kernel has finished. */
    cudaMemcpy(C, d_C, n*n*sizeof(double), cudaMemcpyDeviceToHost);

    printf("Result (top 5x5):\n");
    print_submatrix(C, n);

    free(A); free(B); free(C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}
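A typical way to build and run this version (assuming the file is saved as matmul_cuda.cu and the CUDA toolkit is installed; the second argument is accepted for consistency with the other versions, but the grid dimensions are derived from BLOCK_SIZE):

nvcc matmul_cuda.cu -o matmul_cuda
./matmul_cuda 512 16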

Task_2:
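Both versions below approximate Pi by summing the first num_terms terms of the Leibniz series, pi = 4 * (1 - 1/3 + 1/5 - 1/7 + ...), splitting the terms across processes or threads and then combining the partial sums.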
1. Using MPI (Message Passing Interface) for distributed memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <num_terms> <num_processes>\n", argv[0]);
        return 1;
    }
    long long num_terms = atoll(argv[1]);
    int num_procs = atoi(argv[2]);

    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != num_procs) {
        if (rank == 0) printf("Error: Requested %d processes but got %d\n", num_procs, size);
        MPI_Finalize();
        return 1;
    }

    /* Each process sums its own range of terms of the Leibniz series
       pi/4 = 1 - 1/3 + 1/5 - 1/7 + ... */
    long long terms_per_proc = num_terms / size;
    long long start = rank * terms_per_proc;
    long long end = (rank + 1) * terms_per_proc;
    if (rank == size - 1) end = num_terms;   /* last process takes any leftover terms */

    double local_sum = 0.0;
    for (long long i = start; i < end; i++) {
        double term = 1.0 / (2 * i + 1);
        if (i % 2 == 1) term = -term;
        local_sum += term;
    }

    /* Combine the partial sums on rank 0. */
    double global_sum;
    MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        double pi = 4.0 * global_sum;
        printf("Calculated PI: %.15f\n", pi);
    }

    MPI_Finalize();
    return 0;
}
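A typical way to build and run this version (assuming the file is saved as pi_mpi.c; the term count and process count are example values):

mpicc pi_mpi.c -o pi_mpi
mpirun -np 4 ./pi_mpi 100000000 4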

3. Using OpenMP for Shared Memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <num_terms> <num_threads>\n", argv[0]);
        return 1;
    }
    long long num_terms = atoll(argv[1]);
    int num_threads = atoi(argv[2]);

    double sum = 0.0;
    omp_set_num_threads(num_threads);

    /* Sum the Leibniz series in parallel; the reduction clause gives each
       thread a private partial sum that OpenMP combines at the end. */
    #pragma omp parallel for reduction(+:sum)
    for (long long i = 0; i < num_terms; i++) {
        double term = 1.0 / (2 * i + 1);
        if (i % 2 == 1) term = -term;
        sum += term;
    }

    double pi = 4.0 * sum;
    printf("Calculated PI: %.15f\n", pi);
    return 0;
}
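A typical way to build and run this version (assuming the file is saved as pi_omp.c; the arguments are example values):

gcc pi_omp.c -o pi_omp -fopenmp
./pi_omp 100000000 8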
