Assignment 04

The document contains code implementations for matrix multiplication and Pi calculation using various parallel programming models including MPI, pThreads, OpenMP, and CUDA. Each section provides a distinct approach for distributed and shared memory systems, showcasing how to efficiently perform computations across multiple processes or threads. The implementations include error handling and random matrix generation, with results displayed for the top 5x5 submatrices or calculated Pi values.


Name: Dilawar Ali

Roll No: 107278

Task_1:
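All four implementations below multiply two n x n matrices A and B filled with random values in [0, 1] and print the top-left 5x5 corner of the product C = A x B. The matrix size and the number of processes or threads are read from the command line.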
1. Using MPI (Message Passing Interface) for distributed memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>

/* Fill a flat array with random values in [0, 1]. */
void fill_random(double *matrix, int size) {
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

/* Print the top-left 5x5 corner of an n x n matrix. */
void print_submatrix(double *matrix, int n) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_processes>\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    int num_procs = atoi(argv[2]);

    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != num_procs) {
        if (rank == 0) printf("Error: Requested %d processes but got %d\n", num_procs, size);
        MPI_Finalize();
        return 1;
    }
    if (n % size != 0) {
        if (rank == 0) printf("Error: Matrix size must be divisible by number of processes\n");
        MPI_Finalize();
        return 1;
    }

    double *A = NULL, *B = NULL, *C = NULL;
    double *local_A = malloc(n * n / size * sizeof(double));
    double *local_C = malloc(n * n / size * sizeof(double));
    B = malloc(n * n * sizeof(double));   /* every process needs a full copy of B */

    if (rank == 0) {
        A = malloc(n * n * sizeof(double));
        C = malloc(n * n * sizeof(double));
        srand(time(NULL));
        fill_random(A, n*n);
        fill_random(B, n*n);
    }

    /* Distribute blocks of rows of A; broadcast all of B to every process. */
    MPI_Scatter(A, n*n/size, MPI_DOUBLE, local_A, n*n/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(B, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    /* Each process multiplies its block of rows by B. */
    int rows_per_proc = n / size;
    for (int i = 0; i < rows_per_proc; i++) {
        for (int j = 0; j < n; j++) {
            local_C[i*n + j] = 0;
            for (int k = 0; k < n; k++) {
                local_C[i*n + j] += local_A[i*n + k] * B[k*n + j];
            }
        }
    }

    /* Collect the row blocks of C back on rank 0. */
    MPI_Gather(local_C, n*n/size, MPI_DOUBLE, C, n*n/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Result (top 5x5):\n");
        print_submatrix(C, n);
        free(A); free(C);
    }
    free(local_A); free(local_C); free(B);

    MPI_Finalize();
    return 0;
}
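A typical way to build and run this version (assuming the file is saved as matmul_mpi.c and an MPI toolchain such as Open MPI or MPICH is installed; the matrix size 512 and process count 4 are only example values):

mpicc matmul_mpi.c -o matmul_mpi
mpirun -np 4 ./matmul_mpi 512 4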

2. Using pThreads (Posix Threads) for Shared Memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>

#define MAX_THREADS 1024

/* Shared state: matrix size, thread count, and the matrices themselves. */
int n, num_threads;
double *A, *B, *C;

typedef struct {
    int start_row;
    int end_row;
} ThreadData;

void fill_random(double *matrix, int size) {
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

void print_submatrix(double *matrix) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

/* Each thread multiplies its assigned block of rows of A by B. */
void *matmul_thread(void *arg) {
    ThreadData *data = (ThreadData *)arg;
    for (int i = data->start_row; i < data->end_row; i++) {
        for (int j = 0; j < n; j++) {
            C[i*n + j] = 0;
            for (int k = 0; k < n; k++) {
                C[i*n + j] += A[i*n + k] * B[k*n + j];
            }
        }
    }
    pthread_exit(NULL);
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_threads>\n", argv[0]);
        return 1;
    }
    n = atoi(argv[1]);
    num_threads = atoi(argv[2]);

    A = malloc(n * n * sizeof(double));
    B = malloc(n * n * sizeof(double));
    C = malloc(n * n * sizeof(double));
    srand(time(NULL));
    fill_random(A, n*n);
    fill_random(B, n*n);

    pthread_t threads[MAX_THREADS];
    ThreadData thread_data[MAX_THREADS];
    int rows_per_thread = n / num_threads;   /* assumes n is divisible by num_threads */

    /* Give each thread a contiguous block of rows and start it. */
    for (int i = 0; i < num_threads; i++) {
        thread_data[i].start_row = i * rows_per_thread;
        thread_data[i].end_row = (i + 1) * rows_per_thread;
        pthread_create(&threads[i], NULL, matmul_thread, &thread_data[i]);
    }
    /* Wait for all worker threads to finish. */
    for (int i = 0; i < num_threads; i++) {
        pthread_join(threads[i], NULL);
    }

    printf("Result (top 5x5):\n");
    print_submatrix(C);
    free(A); free(B); free(C);
    return 0;
}
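A typical way to build and run this version (assuming the file is saved as matmul_pthreads.c; the matrix size and thread count are example values):

gcc matmul_pthreads.c -o matmul_pthreads -pthread
./matmul_pthreads 512 8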

3. Using OpenMP for Shared Memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

void fill_random(double *matrix, int size) {
    /* Note: rand() is not thread-safe, so this parallel fill is a simple
       approximation; rand_r() or per-thread generators would be safer. */
    #pragma omp parallel for
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

void print_submatrix(double *matrix, int n) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_threads>\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    int num_threads = atoi(argv[2]);
    omp_set_num_threads(num_threads);

    double *A = malloc(n * n * sizeof(double));
    double *B = malloc(n * n * sizeof(double));
    double *C = malloc(n * n * sizeof(double));
    srand(time(NULL));
    fill_random(A, n*n);
    fill_random(B, n*n);

    /* Parallelize over the (i, j) output elements. */
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            C[i*n + j] = 0;
            for (int k = 0; k < n; k++) {
                C[i*n + j] += A[i*n + k] * B[k*n + j];
            }
        }
    }

    printf("Result (top 5x5):\n");
    print_submatrix(C, n);
    free(A); free(B); free(C);
    return 0;
}
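A typical way to build and run this version (assuming the file is saved as matmul_omp.c and a compiler with OpenMP support such as gcc is used; the arguments are example values):

gcc matmul_omp.c -o matmul_omp -fopenmp
./matmul_omp 512 8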

4. Using CUDA for GPU based systems.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

/* Each GPU thread computes one element C[row][col]. */
__global__ void matmul_kernel(double *A, double *B, double *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        double sum = 0;
        for (int k = 0; k < n; k++) {
            sum += A[row*n + k] * B[k*n + col];
        }
        C[row*n + col] = sum;
    }
}

void fill_random(double *matrix, int size) {
    for (int i = 0; i < size; i++) {
        matrix[i] = (double)rand() / RAND_MAX;
    }
}

void print_submatrix(double *matrix, int n) {
    for (int i = 0; i < 5; i++) {
        for (int j = 0; j < 5; j++) {
            printf("%.2f ", matrix[i*n + j]);
        }
        printf("\n");
    }
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <matrix_size> <num_blocks>\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    int num_blocks = atoi(argv[2]);   /* accepted for the interface; the grid below is sized from BLOCK_SIZE */
    (void)num_blocks;

    double *A = (double*)malloc(n * n * sizeof(double));
    double *B = (double*)malloc(n * n * sizeof(double));
    double *C = (double*)malloc(n * n * sizeof(double));
    srand(time(NULL));
    fill_random(A, n*n);
    fill_random(B, n*n);

    /* Allocate device buffers and copy the inputs to the GPU. */
    double *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, n*n*sizeof(double));
    cudaMalloc(&d_B, n*n*sizeof(double));
    cudaMalloc(&d_C, n*n*sizeof(double));
    cudaMemcpy(d_A, A, n*n*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, n*n*sizeof(double), cudaMemcpyHostToDevice);

    /* One thread per output element, 16x16 threads per block. */
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((n + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (n + BLOCK_SIZE - 1) / BLOCK_SIZE);
    matmul_kernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, n);

    /* cudaMemcpy blocks until the kernel has finished. */
    cudaMemcpy(C, d_C, n*n*sizeof(double), cudaMemcpyDeviceToHost);

    printf("Result (top 5x5):\n");
    print_submatrix(C, n);

    free(A); free(B); free(C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}
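A typical way to build and run this version (assuming the file is saved as matmul_cuda.cu and the CUDA toolkit is installed; the second argument is accepted for consistency with the other versions, but the grid dimensions are derived from BLOCK_SIZE):

nvcc matmul_cuda.cu -o matmul_cuda
./matmul_cuda 512 16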

Task_2:
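Both versions below approximate Pi by summing the first num_terms terms of the Leibniz series, pi = 4 * (1 - 1/3 + 1/5 - 1/7 + ...), splitting the terms across processes or threads and then combining the partial sums.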
1. Using MPI (Message Passing Interface) for distributed memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <num_terms> <num_processes>\n", argv[0]);
        return 1;
    }
    long long num_terms = atoll(argv[1]);
    int num_procs = atoi(argv[2]);

    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != num_procs) {
        if (rank == 0) printf("Error: Requested %d processes but got %d\n", num_procs, size);
        MPI_Finalize();
        return 1;
    }

    /* Each process sums its own range of terms of the Leibniz series
       pi/4 = 1 - 1/3 + 1/5 - 1/7 + ... */
    long long terms_per_proc = num_terms / size;
    long long start = rank * terms_per_proc;
    long long end = (rank + 1) * terms_per_proc;
    if (rank == size - 1) end = num_terms;   /* last process takes any leftover terms */

    double local_sum = 0.0;
    for (long long i = start; i < end; i++) {
        double term = 1.0 / (2 * i + 1);
        if (i % 2 == 1) term = -term;
        local_sum += term;
    }

    /* Combine the partial sums on rank 0. */
    double global_sum;
    MPI_Reduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        double pi = 4.0 * global_sum;
        printf("Calculated PI: %.15f\n", pi);
    }

    MPI_Finalize();
    return 0;
}
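A typical way to build and run this version (assuming the file is saved as pi_mpi.c; the term count and process count are example values):

mpicc pi_mpi.c -o pi_mpi
mpirun -np 4 ./pi_mpi 100000000 4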

3. Using OpenMP for Shared Memory systems.

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(int argc, char *argv[]) {
    if (argc != 3) {
        printf("Usage: %s <num_terms> <num_threads>\n", argv[0]);
        return 1;
    }
    long long num_terms = atoll(argv[1]);
    int num_threads = atoi(argv[2]);

    double sum = 0.0;
    omp_set_num_threads(num_threads);

    /* Sum the Leibniz series in parallel; the reduction clause gives each
       thread a private partial sum that OpenMP combines at the end. */
    #pragma omp parallel for reduction(+:sum)
    for (long long i = 0; i < num_terms; i++) {
        double term = 1.0 / (2 * i + 1);
        if (i % 2 == 1) term = -term;
        sum += term;
    }

    double pi = 4.0 * sum;
    printf("Calculated PI: %.15f\n", pi);
    return 0;
}
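A typical way to build and run this version (assuming the file is saved as pi_omp.c; the arguments are example values):

gcc pi_omp.c -o pi_omp -fopenmp
./pi_omp 100000000 8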
