Parallel & Distributed Computing: MPI - Message Passing Interface
• MPI_COMM_WORLD is a communicator.
• Communicators are used to separate the communication of different modules of the
application. Communicators are essential for writing reusable libraries.
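As a concrete illustration, a library can carve its own communicator out of MPI_COMM_WORLD so its messages never collide with the application's traffic on MPI_COMM_WORLD. A minimal sketch using MPI_Comm_split (the even/odd color choice here is illustrative, not from the slides):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    // Split MPI_COMM_WORLD into two sub-communicators by rank parity;
    // messages sent inside sub_comm cannot be confused with
    // MPI_COMM_WORLD traffic.
    MPI_Comm sub_comm;
    MPI_Comm_split(MPI_COMM_WORLD, world_rank % 2, world_rank, &sub_comm);

    int sub_rank, sub_size;
    MPI_Comm_rank(sub_comm, &sub_rank);
    MPI_Comm_size(sub_comm, &sub_size);
    printf("world rank %d -> sub rank %d of %d\n", world_rank, sub_rank, sub_size);

    MPI_Comm_free(&sub_comm);
    MPI_Finalize();
    return 0;
}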
Point-to-Point Messages
• Consider a single ping-pong match between two players. Each player can be
modeled by FD-DEVS (Finite & Deterministic Discrete Event System Specification) such
that the player model has an input event ?receive and an output event !send, and it
has two states: “Send” and “Wait”. Once the player enters “Send”, it generates
“!send” and returns to “Wait” after the sending time, which is 0.1 time units. While
staying in “Wait”, if it gets “?receive”, it changes into “Send” again. In other
words, the player model stays in “Wait” forever unless it gets “?receive”.
• To make a complete ping-pong match, one player starts as the offender, whose initial
state is “Send”, and the other starts as the defender, whose initial state is “Wait”. Thus,
in the figure above, Player A is the initial offender and Player B is the initial defender. In
addition, to keep the game going, each player’s “!send” event is coupled
to the other player’s “?receive”, as shown in the figure.
Ping Pong

int ping_pong_count = 0;
int partner_rank = (world_rank + 1) % 2;
while (ping_pong_count < PING_PONG_LIMIT) {
  if (world_rank == ping_pong_count % 2) {
    // Increment the ping pong count before you send it
    ping_pong_count++;
    MPI_Send(&ping_pong_count, 1, MPI_INT, partner_rank, 0, MPI_COMM_WORLD);
    printf("%d sent and incremented ping_pong_count %d to %d\n",
           world_rank, ping_pong_count, partner_rank);
  }
  else {
    MPI_Recv(&ping_pong_count, 1, MPI_INT, partner_rank, 0, MPI_COMM_WORLD,
             MPI_STATUS_IGNORE);
    printf("%d received ping_pong_count %d from %d\n",
           world_rank, ping_pong_count, partner_rank);
  }
}

Sample output:
0 sent and incremented ping_pong_count 1 to 1
0 received ping_pong_count 2 from 1
0 sent and incremented ping_pong_count 3 to 1
0 received ping_pong_count 4 from 1
0 sent and incremented ping_pong_count 5 to 1
0 received ping_pong_count 6 from 1
0 sent and incremented ping_pong_count 7 to 1
0 received ping_pong_count 8 from 1
0 sent and incremented ping_pong_count 9 to 1
0 received ping_pong_count 10 from 1
1 received ping_pong_count 1 from 0
1 sent and incremented ping_pong_count 2 to 0
1 received ping_pong_count 3 from 0
1 sent and incremented ping_pong_count 4 to 0
1 received ping_pong_count 5 from 0
1 sent and incremented ping_pong_count 6 to 0
1 received ping_pong_count 7 from 0
1 sent and incremented ping_pong_count 8 to 0
1 received ping_pong_count 9 from 0
1 sent and incremented ping_pong_count 10 to 0
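The fragment above assumes the usual MPI boilerplate around it. A minimal harness sketch that makes it runnable; PING_PONG_LIMIT and the two-process check are assumptions, not shown on the slide:

#include <mpi.h>
#include <stdio.h>
#define PING_PONG_LIMIT 10

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int world_rank, world_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
  if (world_size != 2) {   // the ping-pong pattern needs exactly two ranks
    fprintf(stderr, "This program requires exactly 2 processes\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  // ... ping-pong loop from above goes here ...
  MPI_Finalize();
  return 0;
}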
Ring Topology
[Figure: processes P0, P1, …, Pn connected in a ring]
int token;
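// Receive the token from the left neighbor, then pass it to the right.
// Process 0 sends first (it owns the token initially and has nothing to
// receive yet), which breaks the circular wait that would otherwise
// deadlock a ring of blocking receives.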
if (world_rank != 0) {
MPI_Recv(&token, 1, MPI_INT, world_rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Process %d received token %d from process %d\n", world_rank, token, world_rank - 1);
}
else {
// Set the token's value if you are process 0
token = -1;
}
MPI_Send(&token, 1, MPI_INT, (world_rank + 1) % world_size, 0, MPI_COMM_WORLD);
// Now process 0 can receive from the last process.
if (world_rank == 0) {
MPI_Recv(&token, 1, MPI_INT, world_size - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Process %d received token %d from process %d\n", world_rank, token, world_size - 1);
}
Process 1 received token -1 from process 0
Process 2 received token -1 from process 1
Process 3 received token -1 from process 2
Process 4 received token -1 from process 3
Process 0 received token -1 from process 4
C program for π – Serial Version

Integration: π = ∫₀¹ 4/(1 + x²) dx
Discretization: ∆ = 1/N, xi = (i + 0.5)∆ where i = 0,…,N-1, so π ≈ Σi 4/(1 + xi²) ∆

#include <stdio.h>
#define N 1000000
int main(void)
{
  long long i;
  double step, x, sum = 0.0, pi;
  step = 1.0/N;
  for (i = 0; i < N; i++) {
    x = (i + 0.5)*step;
    sum += 4.0 / (1.0 + x*x);
  }
  pi = sum*step;
  printf("PI = %1.20f\n", pi);
  return 0;
}

Sample output: PI = 3.14159265358997080000
C program for π – Shared Memory Version

#include <stdio.h>
#include <omp.h>
#define N 1000000000
int main(void) {
  long long i;
  double step, x, sum = 0.0, pi;
  step = 1.0/(double) N;
  #pragma omp parallel private(i, x) shared(sum, step)
  {
    int nth = omp_get_num_threads();
    int tid = omp_get_thread_num();
    double mysum = 0.0;
    // Each thread handles the iterations i = tid, tid+nth, tid+2*nth, ...
    for (i = tid; i < N; i += nth) {
      x = ((double) i + 0.5)*step;
      mysum += 4.0 / (1.0 + x*x);
    }
    #pragma omp atomic   // serialize the final accumulation into sum
    sum += mysum;
  }
  pi = sum*step;
  printf("PI = %1.20f\n", pi);
  return 0;
}

Sample output: PI = 3.14159269877025560000
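The same computation is written more idiomatically with OpenMP's reduction clause, which removes the manual partial sums and the atomic update. A minimal equivalent sketch:

#include <stdio.h>
#include <omp.h>
#define N 1000000000

int main(void) {
  double step = 1.0 / (double) N;
  double sum = 0.0;
  // reduction(+:sum) gives each thread a private copy of sum and
  // combines all copies when the parallel loop finishes.
  #pragma omp parallel for reduction(+:sum)
  for (long long i = 0; i < N; i++) {
    double x = ((double) i + 0.5) * step;
    sum += 4.0 / (1.0 + x * x);
  }
  printf("PI = %1.20f\n", sum * step);
  return 0;
}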
C program for π – Distributed Memory Version
#include <stdio.h>
#include <mpi.h>
#include <math.h>
int main(int argc, char ** argv)
{
int N, myid, numprocs, i, rc;
double PI25DT = 3.141592653589793238462643;
double mypi, pi, step, sum, x;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
while (1) {
if (myid == 0) {
printf("Enter the number of intervals: (0 quits) ");
scanf("%d",&N);
} // if
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (N == 0) break; // leave the while loop
step = 1.0 / (double) N;
sum = 0.0;
for (i = myid + 1; i <= N; i += numprocs) {
x = step * ((double)i - 0.5);
sum += 4.0 / (1.0 + x*x);
}
mypi = step * sum;
MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (myid == 0)
printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT));
} // while
MPI_Finalize();
}
C program for π – Sample Run
$ mpirun -n 2 MPIpi
Enter the number of intervals: (0 quits) 9999
pi is approximately 3.1415926544232997, Error is 0.0000000008335066
Enter the number of intervals: (0 quits) 999999
pi is approximately 3.1415926535898357, Error is 0.0000000000000426
Enter the number of intervals: (0 quits) 99999999
pi is approximately 3.1415926535900205, Error is 0.0000000000002274
Enter the number of intervals: (0 quits) 999999
pi is approximately 3.1415926535898357, Error is 0.0000000000000426
Enter the number of intervals: (0 quits) 0
Distributed Bubble Sort

Two processes P0 and P1 each hold half of the array. “Bubble” is a local bubble pass over a process's own chunk; “Exchg.” compares the boundary elements of neighboring chunks and swaps them if they are out of order (trace values as on the slide):

          P0            P1
Input   9 8 7 6 5 4 3 2 1 0
Split   9 8 7 6 5 | 4 3 2 1 0
Bubble  8 7 6 5 9 | 3 2 1 0 4
Exchg.  8 7 6 5 3 | 9 2 1 0 4
Bubble  7 6 5 3 8 | 2 1 0 4 9
Exchg.  7 6 5 3 2 | 8 1 0 4 9
Bubble  6 5 3 2 7 | 1 0 4 8 9
Exchg.  6 5 3 2 1 | 7 0 4 8 9
Bubble  5 3 2 1 6 | 0 4 7 8 9
Exchg.  5 3 2 1 0 | 6 4 7 8 9
Bubble  3 2 1 0 5 | 4 6 7 8 9
Exchg.  3 2 1 0 4 | 5 6 7 8 9
Bubble  2 1 0 3 4 | 5 6 7 8 9
Exchg.  1 0 2 3 4 | 5 6 7 8 9
Bubble  0 1 2 3 4 | 5 6 7 8 9
Exchg.  0 1 2 3 4 | 5 6 7 8 9
Gather  0 1 2 3 4 5 6 7 8 9
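The Exchg. step above can be implemented with a single MPI_Sendrecv per neighbor, which avoids the send/receive ordering concerns of two separate blocking calls. A minimal sketch of the rightward boundary exchange; the variable names follow the Odd-Even Sort code later in this section, and the right neighbor must issue the mirrored call:

// Exchange boundary elements with the right neighbor:
// send my largest (last) element, receive the neighbor's smallest.
if (MyRank < NumProcs - 1) {
  int t;
  MPI_Sendrecv(&a[ChunkSize-1], 1, MPI_INT, MyRank+1, 0,
               &t,              1, MPI_INT, MyRank+1, 0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  if (a[ChunkSize-1] > t) {  // keep the smaller value on the left side
    a[ChunkSize-1] = t;
    isSorted = 0;            // a swap happened, so another pass is needed
  }
}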
Sorting Algorithms on Linear Array
➢Assume processors are connected as a linear array (P0, P1, …, Pn).

Odd-even transposition trace: “Odd”/“Even” are the odd and even local transposition steps and “Exchg.” is the boundary exchange between neighbors (values as on the slide):

Odd     9 7 8 5 6 4 2 3 0 1
Even    7 9 5 8 6 2 4 0 3 1
Exchg.  7 9 5 8 2 6 4 0 3 1
Odd     7 5 9 2 8 6 0 4 1 3
Even    5 7 2 9 8 0 6 1 4 3
Exchg.  5 7 2 9 0 8 6 1 4 3
Odd     5 2 7 0 9 8 1 6 3 4
Even    2 5 0 7 9 1 8 3 6 4
Exchg.  2 5 0 7 1 9 8 3 6 4
⋮
Even    0 2 1 5 7 3 9 4 8 6
Exchg.  0 2 1 5 3 7 9 4 8 6
Odd     0 1 2 3 5 7 4 9 6 8
Even    0 1 2 3 5 4 7 6 9 8
Exchg.  0 1 2 3 4 5 7 6 9 8
Odd     0 1 2 3 4 5 6 7 8 9
Even    0 1 2 3 4 5 6 7 8 9
Exchg.  0 1 2 3 4 5 6 7 8 9
Gather  0 1 2 3 4 5 6 7 8 9
Sorting Algorithms on Linear Array
➢Assume processors are connected as a linear array (P0, P1, …, Pn).
Algorithm (all processors execute it):
1. The rank 0 processor sends a distinct portion of the array to be sorted, i.e. a subarray, to every processor.
2. All processors receive their subarray from the rank 0 processor.
3. isSorted <- false
4. while (!isSorted)
   4.1 isSorted <- true
   4.2 apply the odd transposition step to the subarray
   4.3 apply the even transposition step to the subarray
   4.4 every processor except the one with the highest rank sends its highest-index element to the
       processor on its right, which sets isSorted <- false if it replaces its lowest-index element with
       the received element
   4.5 every processor except the one with rank 0 sends its lowest-index element to the processor on
       its left, which sets isSorted <- false if it replaces its highest-index element with the received
       element
5. The rank 0 processor gathers the subarrays from all processors.
Odd-Even Sort - 1
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#define N 24000
#define R 331
#define true 1
#define false 0
void PrintArray(int * a, int Asize);
int main(int argc, char *argv[]) {
int i, j, k, t, NumProcs, MyRank, ChunkSize, isSorted = false;
MPI_Status Status;
int a[N], b[N];
MPI_Init(NULL,NULL);
MPI_Comm_size(MPI_COMM_WORLD, &NumProcs);
MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
// Calculate the worker subarray size
ChunkSize = N / NumProcs;
Odd-Even Sort - 2
// Generate test Data
if (MyRank == 0) {
srand(time(NULL));
for (i = 0; i < N; i++)
b[i] = rand() % R; }
// Distribute Data among workers
//MPI_Scatter(b, ChunkSize, MPI_INT, a, ChunkSize, MPI_INT, 0, MPI_COMM_WORLD);
if (MyRank == 0) {
  for (i = 0; i < ChunkSize; i++) // rank 0 keeps the first chunk for itself
    a[i] = b[i];
  for (i = 1; i < NumProcs; i++)
    MPI_Send(&b[i*ChunkSize], ChunkSize, MPI_INT, i, 0, MPI_COMM_WORLD);
}
else
  MPI_Recv(a, ChunkSize, MPI_INT, 0, 0, MPI_COMM_WORLD, &Status);
//
k = 0;
while (!isSorted) {
isSorted = true; k++;
// Odd Transposition
for (i = 1; i <= ChunkSize - 2 ; i += 2)
if (a[i] > a[i+1]) { t = a[i]; a[i] = a[i+1]; a[i+1] = t; isSorted = false;}
//Even Transposition
for (i = 0; i <= ChunkSize - 2; i += 2)
if (a[i] > a[i+1]) { t = a[i]; a[i] = a[i+1]; a[i+1] = t; isSorted = false;}// Exchange
Odd-Even Sort - 3
if (MyRank < NumProcs-1) {
// MPI_Sendrecv(&a[ChunkSize-1], 1, MPI_INT, MyRank+1, 0, &t, 1, MPI_INT, MyRank+1, 0, MPI_COMM_WORLD, &Status);
MPI_Send(&a[ChunkSize-1], 1, MPI_INT, MyRank+1, 0, MPI_COMM_WORLD);
MPI_Recv(&t, 1, MPI_INT, MyRank+1, 0, MPI_COMM_WORLD, &Status);
if (a[ChunkSize-1] > t) {a[ChunkSize-1] = t; isSorted = false;}
}
if (MyRank > 0) {
//MPI_Sendrecv(&a[0], 1, MPI_INT, MyRank-1, 0, &t, 1, MPI_INT, MyRank-1, 0, MPI_COMM_WORLD, &Status);
MPI_Recv(&t, 1 , MPI_INT, MyRank-1, 0, MPI_COMM_WORLD, &Status);
MPI_Send(&a[0], 1, MPI_INT, MyRank-1, 0, MPI_COMM_WORLD);
if (t > a[0]) {isSorted = false; a[0] = t;}
}
// MPI forbids aliased send/receive buffers; MPI_IN_PLACE reduces into isSorted directly
MPI_Allreduce(MPI_IN_PLACE, &isSorted, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
}
// Gather Data from workers
MPI_Gather(a, ChunkSize, MPI_INT, b, ChunkSize, MPI_INT, 0, MPI_COMM_WORLD);
if (MyRank == 0) {
printf("Array Size: %d elements. No of iterations to sort the array : %d\n", N, k);
//PrintArray(b, N);
printf("\n");
}
MPI_Finalize(); } // main
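The hand-rolled Send/Recv distribution above (and the commented-out MPI_Scatter) can be collapsed into one collective; MPI_Scatter also delivers rank 0's chunk to itself, so the explicit self-copy disappears. A minimal sketch, assuming N is divisible by NumProcs as in the code above:

// One collective replaces the send/receive loop: every rank, including
// rank 0, receives its ChunkSize-element slice of b into a.
MPI_Scatter(b, ChunkSize, MPI_INT,
            a, ChunkSize, MPI_INT,
            0, MPI_COMM_WORLD);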
Matrix-Vector Multiplication

// Serial Version
for (i = 0; i < m; i++)
  for (j = 0, y[i] = 0; j < n; j++)
    y[i] += a[i][j] * x[j];

// Shared Memory – OpenMP Version
#pragma omp parallel for private(j)
for (i = 0; i < m; i++)
  for (j = 0, y[i] = 0; j < n; j++)
    y[i] += a[i][j] * x[j];

// Distributed Memory – MPI Version (each rank computes its own block of rows)
nrows = m / nprocs;
for (i = nrows*rank; i < nrows*(rank+1); i++)
  for (j = 0, y[i] = 0; j < n; j++)
    y[i] += a[i][j] * x[j];
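To make the MPI version complete, the row blocks must be collected if every rank needs the full y; MPI_Allgather with MPI_IN_PLACE does this because each rank already computed its block at the right offset of y. A minimal self-contained sketch; the sizes and the requirement that M be divisible by the number of ranks are assumptions:

#include <mpi.h>
#include <stdio.h>
#define M  8  /* rows (assumed divisible by the number of ranks) */
#define Nc 8  /* columns */

int main(int argc, char **argv) {
  static double a[M][Nc], x[Nc], y[M];
  int rank, nprocs;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  // Simple test data, identical on every rank.
  for (int i = 0; i < M; i++)
    for (int j = 0; j < Nc; j++)
      a[i][j] = i + j;
  for (int j = 0; j < Nc; j++)
    x[j] = 1.0;

  int nrows = M / nprocs;
  // Each rank computes its own block of rows of y = A*x.
  for (int i = nrows*rank; i < nrows*(rank+1); i++) {
    y[i] = 0.0;
    for (int j = 0; j < Nc; j++)
      y[i] += a[i][j] * x[j];
  }
  // Collect all row blocks; MPI_IN_PLACE takes each rank's contribution
  // from its own slot in y, so no separate send buffer is needed.
  MPI_Allgather(MPI_IN_PLACE, nrows, MPI_DOUBLE,
                y, nrows, MPI_DOUBLE, MPI_COMM_WORLD);

  if (rank == 0)
    for (int i = 0; i < M; i++) printf("y[%d] = %g\n", i, y[i]);
  MPI_Finalize();
  return 0;
}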
Parallel Computation of DFT using FFT
• The discrete Fourier transform (DFT) is a very important algorithm that
finds its use in many applications such as telecommunications, speech
processing, image processing, medical imaging such as computer-assisted
tomography (CAT), radar (synthetic aperture radar), sonar, and
antenna arrays (phased arrays).
• The DFT finds the spectrum of a periodic discrete-time signal with
period N. The spectral component X(k) is obtained by the following
equation:

  X(k) = Σ_{n=0}^{N−1} x(n) · W_N^{nk},  k = 0, 1, …, N−1,

where the twiddle factor W_N = e^{−j2π/N} is a primitive Nth root of unity.
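The FFT evaluates this in O(N log N); for reference, a direct O(N²) evaluation of the defining equation looks like the following sketch (the test signal and sizes are illustrative):

#include <stdio.h>
#include <math.h>
#include <complex.h>

// Direct O(N^2) evaluation of X(k) = sum_n x(n) * W_N^{nk},
// with twiddle factor W_N = e^{-j*2*pi/N}.
static void dft(int N, const double complex x[], double complex X[]) {
  const double pi = 3.14159265358979323846;
  for (int k = 0; k < N; k++) {
    X[k] = 0.0;
    for (int n = 0; n < N; n++)
      X[k] += x[n] * cexp(-I * 2.0 * pi * (double)(n * k) / N);
  }
}

int main(void) {
  // 8-point test signal: a single cosine at bin 1.
  enum { N = 8 };
  double complex x[N], X[N];
  for (int n = 0; n < N; n++)
    x[n] = cos(2.0 * 3.14159265358979323846 * n / N);
  dft(N, x, X);
  for (int k = 0; k < N; k++)
    printf("X(%d) = %6.3f %+6.3fj\n", k, creal(X[k]), cimag(X[k]));
  return 0;
}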
Matrix multiplication: for an m×n matrix A = (a_ij) and an n×n matrix B = (b_jk),
the product C = A × B has entries c_ik = Σ_{j=0}^{n−1} a_ij · b_jk; for example,
c_00 = Σ_j a_0j b_j0 and c_{m−1,n−1} = Σ_j a_{m−1,j} b_{j,n−1}.
Parallel Matrix Multiplication
1. The master broadcasts matrix B to all workers.
2. The master sends each worker i a submatrix Ai of size (Arows / Nprocs) × Acols.
3. Each worker i multiplies its submatrix Ai by matrix B and stores the result in submatrix Ci.
4. The master gathers the submatrix Ci from the ith worker into matrix C (see the sketch after the figure below).
[Figure: A is split into row blocks P0–P3; each block multiplied by B yields the corresponding row block of C.]
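A minimal sketch of steps 1–4 using collectives, assuming the row count is divisible by the number of ranks. The real program whose output follows uses a master/worker Send/Recv scheme with uneven row counts (7, 7, 6), so this is a simplification:

#include <mpi.h>
#include <stdio.h>
#define NRA 8  /* rows of A (assumed divisible by the number of ranks) */
#define NCA 8  /* cols of A = rows of B */
#define NCB 8  /* cols of B */

int main(int argc, char **argv) {
  static double A[NRA][NCA], B[NCA][NCB], C[NRA][NCB];
  static double Ai[NRA][NCA], Ci[NRA][NCB]; /* per-rank blocks (upper-bound size) */
  int rank, nprocs;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int rows = NRA / nprocs;

  if (rank == 0) { /* master initializes the test matrices */
    for (int i = 0; i < NRA; i++)
      for (int j = 0; j < NCA; j++)
        A[i][j] = i + j;
    for (int i = 0; i < NCA; i++)
      for (int j = 0; j < NCB; j++)
        B[i][j] = i * j;
  }

  MPI_Bcast(B, NCA*NCB, MPI_DOUBLE, 0, MPI_COMM_WORLD);      /* step 1 */
  MPI_Scatter(A, rows*NCA, MPI_DOUBLE,
              Ai, rows*NCA, MPI_DOUBLE, 0, MPI_COMM_WORLD);  /* step 2 */

  for (int i = 0; i < rows; i++)                              /* step 3 */
    for (int k = 0; k < NCB; k++) {
      Ci[i][k] = 0.0;
      for (int j = 0; j < NCA; j++)
        Ci[i][k] += Ai[i][j] * B[j][k];
    }

  MPI_Gather(Ci, rows*NCB, MPI_DOUBLE,
             C, rows*NCB, MPI_DOUBLE, 0, MPI_COMM_WORLD);     /* step 4 */

  if (rank == 0) printf("C[0][0] = %g\n", C[0][0]);
  MPI_Finalize();
  return 0;
}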
Execution Results
MatxMatMPI has started with 4 tasks.
Initializing arrays...
Sending 7 rows to task 1 offset=0
Sending 7 rows to task 2 offset=7
Sending 6 rows to task 3 offset=14
Received results from task 1
Received results from task 2
Received results from task 3
Result Matrix:
1366 1429 1065 1169 1487 1541 1256 1694 1507 1169 1029 1166 1046 1219 1357 1430 1193 1374 1330 1473
1097 1634 913 1232 1338 1454 1254 1476 1355 1286 1225 1217 1234 1157 1215 1529 1264 1393 1333 1343
1647 1667 1351 1585 1897 1805 1307 1919 1550 1307 1537 1398 1520 1727 1591 1904 1716 1648 1619 1769
1513 1745 1406 1697 1573 1825 1558 1857 1416 1566 1554 1572 1629 1600 1467 1879 1706 1713 1698 1687
1567 1654 1283 1711 1833 1788 1584 1962 1612 1427 1230 1434 1331 1501 1608 1678 1507 1488 1635 1716
1307 1474 1348 1425 1249 1265 1270 1378 1224 1178 1312 1194 1280 1304 1087 1499 1303 1290 1377 1558
1375 1787 1281 1568 1525 1546 1279 1472 1305 1456 1470 1458 1640 1288 1250 1703 1635 1722 1832 1608
1232 1403 1109 1371 1352 1414 1224 1206 1117 1069 1339 1272 1294 1214 1204 1557 1484 1660 1471 1523
1025 1384 955 1462 1302 1403 1298 1153 1083 1118 1432 1243 1209 1346 1178 1342 1194 1430 1074 1109
1373 1867 1168 1607 1620 1590 1415 1599 1476 1454 1413 1321 1419 1502 1462 1744 1462 1384 1564 1598
1271 1379 1037 1313 1390 1290 1043 1231 1212 875 1046 1019 1224 1116 959 1201 1129 1112 1065 1303
1213 1616 1159 1369 1514 1589 1225 1581 1356 1572 1291 1355 1315 1221 1374 1616 1430 1410 1321 1392
1501 1610 1340 1578 1601 1571 1491 1498 1357 1364 1384 1543 1442 1411 1546 1678 1549 1670 1697 1771
1437 1672 1116 1716 1626 1599 1269 1465 1395 1125 1494 1298 1396 1512 1349 1710 1446 1707 1604 1461
1113 1521 1004 1327 1308 1272 947 1320 1225 1051 1087 1063 1198 975 923 1487 1382 1377 1474 1298
1302 1413 1076 1180 1458 1540 1207 1578 1299 1161 1409 1307 1451 1421 1287 1529 1359 1621 1351 1471
1377 1778 1322 1649 1449 1480 1380 1492 1374 1482 1253 1348 1229 1355 1284 1421 1301 1156 1346 1339
1799 1879 1561 1711 1700 1931 1531 1866 1520 1448 1593 1546 1748 1651 1490 1863 1742 1670 1626 1836
1418 1581 1150 1478 1329 1580 1350 1321 1143 1264 1339 1380 1482 1452 1298 1529 1408 1605 1459 1558
1296 1659 902 1330 1370 1507 1216 1475 1285 1288 1065 965 1323 1260 1249 1596 1227 1078 1367 1556
Matrix Multiplication – Strassen’s Method
[Figure: Strassen’s method]
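The slide shows the method only as a figure; for reference, the standard textbook Strassen formulation for a 2×2 block partition of C = AB is:

\begin{aligned}
M_1 &= (A_{11}+A_{22})(B_{11}+B_{22}), & M_2 &= (A_{21}+A_{22})\,B_{11},\\
M_3 &= A_{11}(B_{12}-B_{22}), & M_4 &= A_{22}(B_{21}-B_{11}),\\
M_5 &= (A_{11}+A_{12})\,B_{22}, & M_6 &= (A_{21}-A_{11})(B_{11}+B_{12}),\\
M_7 &= (A_{12}-A_{22})(B_{21}+B_{22}),\\
C_{11} &= M_1+M_4-M_5+M_7, & C_{12} &= M_3+M_5,\\
C_{21} &= M_2+M_4, & C_{22} &= M_1-M_2+M_3+M_6.
\end{aligned}

Seven block multiplications per level instead of eight give the O(n^{log₂ 7}) ≈ O(n^{2.81}) running time.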
[Figure: DG (dependence graph) for an 8-point decimation-in-time FFT algorithm]
When the gathered chunks have different sizes on different ranks (as with the 7/7/6-row split in the execution results above), MPI_Gatherv collects variable-length contributions:

MPI_Gatherv( sendarray, 100, MPI_INT, rbuf, rcounts, displs, MPI_INT, root, comm);
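A minimal sketch of the root-side bookkeeping, assuming four ranks and illustrative counts; each rank's send count must match its entry in rcounts:

// Root-side arrays for MPI_Gatherv: rank i contributes rcounts[i] ints,
// placed at offset displs[i] in the receive buffer.
int rcounts[4] = {0, 7*20, 7*20, 6*20}; // e.g., master sends nothing; workers send 7, 7, and 6 rows of a 20-column matrix
int displs[4];
displs[0] = 0;
for (int i = 1; i < 4; i++)             // each chunk starts where the previous one ends
  displs[i] = displs[i-1] + rcounts[i-1];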