SERC IntroMPI 2019-09-14 v0
Cluster - 90’s
Modern HPC Facilities
A simple MPI program in C

#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int nproc, myrank;

    /* Initialize MPI */
    MPI_Init(&argc,&argv);

    /* Get the number of processes and my rank */
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    printf("Hello from %d.\n",myrank);

    /* Finalize */
    MPI_Finalize();
    return 0;
}

Run on any cluster:
$ mpirun -n 8 ./a.out

Run on SahasraT:
$ aprun -n 8 ./a.out

Output (order may vary):
Hello from 0.
Hello from 1.
Hello from 2.
Hello from 3.
Hello from 4.
Hello from 5.
Hello from 6.
Hello from 7.
Header file

#include "mpi.h"

• Defines MPI-related parameters and functions
• Must be included in all routines calling MPI functions
• Fortran codes can instead use the include file:
  include 'mpif.h'
Initialization

/* Initialize MPI */
MPI_Init(&argc,&argv);

• Must be called at the beginning of the code, before any other calls to MPI functions
• Sets up the communication channels between the processes and gives each one a rank
How many processes do we have?

/* Get the number of processes */
MPI_Comm_size(MPI_COMM_WORLD,&nproc);

• Returns the number of processes available under the MPI_COMM_WORLD communicator
• This is the number used on the mpiexec (or mpirun) command line:
  mpiexec -n nproc a.out
What is my rank?

/* Get my process number (rank) */
MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

• Gets my rank among all of the nproc processes under MPI_COMM_WORLD
• This is a unique number that can be used to distinguish this process from the others
Termination

/* Finalize */
MPI_Finalize();

• Must be called at the end of the program, after all other MPI calls

The Fortran equivalents of initialization and termination are:

! Initialize MPI
call MPI_Init(ierr)

! Finalize
call MPI_Finalize(ierr)
Running the hello program

$ mpirun -n 8 ./a.out        (any cluster)
$ aprun -n 8 ./a.out         (SahasraT)

The launcher starts eight copies of the same executable. Every process runs the
full program, obtains its own rank from MPI_Comm_rank, and prints its own
"Hello from ..." line, so the eight output lines appear in an arbitrary order.
MPI_COMM_WORLD is the default communicator containing all the processes. We don't have any subsets yet, so we just choose the default.
Point to point: 2 processes at a time

MPI_Send(sendbuf,count,datatype,dest,tag,comm)

MPI_Recv(recvbuf,count,datatype,source,tag,comm,status)

MPI_Sendrecv(sendbuf,sendcount,sendtype,dest,sendtag,
             recvbuf,recvcount,recvtype,source,recvtag,comm,status)
Datatypes are:
C: MPI_INT, MPI_FLOAT, MPI_DOUBLE, MPI_CHAR, etc.
FORTRAN: MPI_INTEGER, MPI_REAL, MPI_DOUBLE_PRECISION,
         MPI_COMPLEX, MPI_CHARACTER, MPI_LOGICAL, etc.
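As an illustration, a minimal sketch of a two-process exchange in C (the payload value 42 and tag 0 are arbitrary choices; run with at least two processes):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int nproc, myrank, value;
    MPI_Status status;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    if (myrank == 0) {
        value = 42;                                   /* arbitrary payload */
        MPI_Send(&value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
    } else if (myrank == 1) {
        MPI_Recv(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        printf("rank 1 received %d from rank 0\n", value);
    }

    MPI_Finalize();
    return 0;
}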
Collective communication:
Broadcast

MPI_Bcast(buffer,count,datatype,root,comm,ierr)

Before               After Broadcast
P0: A B C D          P0: A B C D
P1:                  P1: A B C D
P2:                  P2: A B C D
P3:                  P3: A B C D

• One process (called "root") sends data to all the other processes in the same
  communicator
• Must be called by ALL processes with the same arguments
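A small sketch of MPI_Bcast usage in C (the array contents are illustrative; note that every rank makes the call, not just the root):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int myrank;
    int data[4] = {0, 0, 0, 0};

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    if (myrank == 0) {            /* only the root fills the buffer before the call */
        data[0] = 1; data[1] = 2; data[2] = 3; data[3] = 4;
    }

    /* all processes call MPI_Bcast with the same arguments */
    MPI_Bcast(data, 4, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d now has %d %d %d %d\n",
           myrank, data[0], data[1], data[2], data[3]);

    MPI_Finalize();
    return 0;
}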
Collective communication:
Gather

MPI_Gather(sendbuf,sendcount,sendtype,recvbuf,recvcount,
           recvtype,root,comm,ierr)

Before               After Gather
P0: A                P0: A B C D
P1: B                P1:
P2: C                P2:
P3: D                P3:

• One root process collects data from all the other processes in the same communicator
• Must be called by all the processes in the communicator with the same arguments
• "sendcount" is the number of basic datatypes sent, not received (the example above would
  be sendcount = 1)
• Make sure that you have enough space in your receiving buffer!
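A sketch of MPI_Gather in C where each rank contributes one integer and rank 0 collects them all (the fixed receive buffer of 64 entries is an assumption of this sketch):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int nproc, myrank, i;
    int myval, recvbuf[64];                 /* assumes nproc <= 64 */

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    myval = 10 * myrank;                    /* one element per rank: sendcount = 1 */
    MPI_Gather(&myval, 1, MPI_INT, recvbuf, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (myrank == 0)                        /* only the root has the gathered data */
        for (i = 0; i < nproc; i++)
            printf("got %d from rank %d\n", recvbuf[i], i);

    MPI_Finalize();
    return 0;
}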
Collective communication:
Gather to All

MPI_Allgather(sendbuf,sendcount,sendtype,recvbuf,recvcount,
              recvtype,comm,ierr)

Before               After Allgather
P0: A                P0: A B C D
P1: B                P1: A B C D
P2: C                P2: A B C D
P3: D                P3: A B C D

• All processes within a communicator collect data from each other and end up with the
  same information
• Must be called by all the processes in the communicator with the same arguments
• Again, sendcount is the number of elements sent
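The same pattern with MPI_Allgather, after which every rank holds the full array (again the fixed buffer size is only an assumption of the sketch):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int nproc, myrank;
    int myval, allvals[64];                 /* assumes nproc <= 64 */

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    myval = myrank + 100;                   /* illustrative payload */
    MPI_Allgather(&myval, 1, MPI_INT, allvals, 1, MPI_INT, MPI_COMM_WORLD);

    /* every rank now holds allvals[0..nproc-1] */
    printf("rank %d sees first=%d last=%d\n", myrank, allvals[0], allvals[nproc-1]);

    MPI_Finalize();
    return 0;
}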
Collective communication:
Reduction

MPI_Reduce(sendbuf,recvbuf,count,datatype,op,root,comm,ierr)

Before               After Reduce (+)
P0: A                P0: A+B+C+D
P1: B                P1:
P2: C                P2:
P3: D                P3:

• One root process collects data from all the other processes in the same communicator and
  performs an operation on the received data
• Called by all the processes with the same arguments
• Operations are: MPI_SUM, MPI_MIN, MPI_MAX, MPI_PROD, logical AND, OR,
  XOR, and a few more
• Users can define their own operation with MPI_Op_create()
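A sketch of MPI_Reduce in C, summing one value per rank onto rank 0:

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int nproc, myrank;
    int myval, total = 0;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    myval = myrank + 1;                     /* ranks contribute 1, 2, ..., nproc */
    MPI_Reduce(&myval, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myrank == 0)                        /* only the root holds the result */
        printf("sum over %d ranks = %d\n", nproc, total);

    MPI_Finalize();
    return 0;
}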
Collective communication:
Reduction to All

MPI_Allreduce(sendbuf,recvbuf,count,datatype,op,comm,ierr)

Before               After Allreduce (+)
P0: A                P0: A+B+C+D
P1: B                P1: A+B+C+D
P2: C                P2: A+B+C+D
P3: D                P3: A+B+C+D

• All processes within a communicator collect data from all the other processes and
  perform an operation on the received data
• Called by all the processes with the same arguments
• Operations are the same as for MPI_Reduce
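A short sketch of MPI_Allreduce, here with MPI_MAX so that every rank ends up with the global maximum (the per-rank values are arbitrary):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int myrank, myval, gmax;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    myval = (7 * myrank) % 5;               /* arbitrary per-rank value */
    MPI_Allreduce(&myval, &gmax, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    /* unlike MPI_Reduce, every rank holds the result */
    printf("rank %d: global max = %d\n", myrank, gmax);

    MPI_Finalize();
    return 0;
}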
More MPI collective calls

One "root" process sends a different piece of the data to each one of the other
processes (the inverse of gather):

MPI_Scatter(sendbuf,sendcnt,sendtype,recvbuf,recvcnt,
            recvtype,root,comm,ierr)
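A sketch of MPI_Scatter in C where rank 0 hands one element to every rank (the fixed send buffer size is an assumption of the sketch):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int nproc, myrank, i;
    int sendbuf[64], myval;                 /* assumes nproc <= 64 */

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nproc);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    if (myrank == 0)                        /* only the root's send buffer is read */
        for (i = 0; i < nproc; i++)
            sendbuf[i] = i * i;

    /* sendcnt and recvcnt are the per-process counts (1 here) */
    MPI_Scatter(sendbuf, 1, MPI_INT, &myval, 1, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d received %d\n", myrank, myval);

    MPI_Finalize();
    return 0;
}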
Timing

Fortran:
starttime = MPI_WTIME()
... program body ...
endtime = MPI_WTIME()
elapsedtime = endtime - starttime
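In C the equivalent is MPI_Wtime(), which returns wall-clock time in seconds as a double; a minimal sketch (the loop is just a stand-in for the program body):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    double starttime, endtime;
    long i, s = 0;

    MPI_Init(&argc,&argv);

    starttime = MPI_Wtime();
    for (i = 0; i < 100000000L; i++)        /* stand-in for the program body */
        s += i;
    endtime = MPI_Wtime();

    printf("elapsed time = %f s (checksum %ld)\n", endtime - starttime, s);

    MPI_Finalize();
    return 0;
}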
Blocking communications

• The call waits until the data transfer is done
  – The sending process waits until all data are transferred to the system buffer
    (differences for eager vs rendezvous protocols...)
  – The receiving process waits until all data are transferred from the system buffer
    to the receive buffer
• All collective communications are blocking
Non-blocking communications

• The call returns immediately after the data transfer is initiated
• Allows overlapping computation with communication
• Need to be careful, though
  – If the send or receive buffers are updated before the transfer is over,
    the result will be wrong
Debugging tips

Use "unbuffered" writes to do "printf-debugging" and always write out the process id:

C:       fprintf(stderr,"%d: ...",myid,...);
Fortran: write(0,*) myid,': ...'

If the code detects an error and needs to terminate, use MPI_Abort. The
error code is returned to the calling environment, so it can be any number.

C:       MPI_Abort(MPI_Comm comm, int errorcode);
Fortran: call MPI_ABORT(comm, errorcode, ierr)
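Putting the two tips together, a sketch of the error-handling pattern in C (the input file name and error code 1 are arbitrary):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int myrank;
    FILE *ifp;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    ifp = fopen("ex4.in","r");
    if (ifp == NULL) {
        fprintf(stderr, "%d: cannot open ex4.in\n", myrank);   /* unbuffered, with rank */
        MPI_Abort(MPI_COMM_WORLD, 1);                          /* terminates all processes */
    }
    fclose(ifp);

    MPI_Finalize();
    return 0;
}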
Example: computing pi by numerical integration, serial C version

#include <stdio.h>
#include <math.h>

int main( int argc, char *argv[] )
{
    int n, i;
    double PI25DT = 3.141592653589793238462643;
    double mypi, pi, h, sum, x;
    FILE *ifp;

    ifp = fopen("ex4.in","r");
    fscanf(ifp,"%d",&n);
    fclose(ifp);
    printf("number of intervals = %d\n",n);

    h = 1.0 / (double) n;
    sum = 0.0;
    for (i = 1; i <= n; i++) {
        x = h * ((double)i - 0.5);
        sum += (4.0 / (1.0 + x*x));
    }
    mypi = h * sum;
    pi = mypi;

    printf("pi is approximately %.16f, Error is %.16f\n",
           pi, fabs(pi - PI25DT));
    return 0;
}
Parallel version: the root reads the input and broadcasts it to all processes;
each process then calculates its own section of the integral, and the partial
sums are added up with MPI_Reduce.

#include "mpi.h"
#include <stdio.h>
#include <math.h>

int main( int argc, char *argv[] )
{
    int n, myid, numprocs, i, j, tag, my_n;
    double PI25DT = 3.141592653589793238462643;
    double mypi,pi,h,sum,x,pi_frac,tt0,tt1,ttf;
    FILE *ifp;
    MPI_Status Stat;
    MPI_Request request;

    n = 1;
    tag = 1;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&myid);

    tt0 = MPI_Wtime();

    /* Root reads the input */
    if (myid == 0) {
        ifp = fopen("ex4.in","r");
        fscanf(ifp,"%d",&n);
        fclose(ifp);
        //printf("number of intervals = %d\n",n);
    }

    /* Global communication. Process 0 "broadcasts" n to all other processes */
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    /* Each process calculates its section of the integral */
    h = 1.0 / (double) n;
    sum = 0.0;
    for (i = myid*n/numprocs+1; i <= (myid+1)*n/numprocs; i++) {
        x = h * ((double)i - 0.5);
        sum += (4.0 / (1.0 + x*x));
    }
    mypi = h * sum;

    /* Partial sums are added up with MPI_Reduce; pi is valid only on rank 0 */
    MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    ttf = MPI_Wtime();
    printf("myid=%d pi is approximately %.16f, Error is %.16f time = %10f\n",
           myid, pi, fabs(pi - PI25DT), (ttf-tt0));

    MPI_Finalize();
    return 0;
}
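To build and launch the example, something along these lines is typical (mpicc is the usual MPI compiler wrapper, but the exact wrapper and launcher depend on the installation; on SahasraT the Cray wrappers and aprun are used instead):

$ mpicc -o pi pi_mpi.c        (pi_mpi.c and pi are placeholder names)
$ mpirun -n 8 ./pi            (or: aprun -n 8 ./pi)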
Thank you...
Non-blocking send and receive
Point to point:
MPI_Isend(buf,count,datatype,dest,tag,comm,request,ierr)
MPI_Irecv(buf,count,datatype,source,tag,comm,request,ierr)
The functions MPI_Wait and MPI_Test are used to complete a nonblocking communication
MPI_Wait(request,status,ierr)
MPI_Test(request,flag,status,ierr)
MPI_Wait returns when the operation identified by “request” is complete. This is a non-local operation.
MPI_Test returns “flag = true” if the operation identified by “request” is complete. Otherwise it returns
“flag = false”. This is a local operation.
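A sketch in C of a non-blocking exchange between ranks 0 and 1: the receive is posted first, independent work could overlap the transfer, and MPI_Wait completes both requests (the payload values are arbitrary):

#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int myrank, other, sendval, recvval;
    MPI_Request sreq, rreq;
    MPI_Status status;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

    if (myrank < 2) {                        /* only ranks 0 and 1 take part in this sketch */
        other = 1 - myrank;
        sendval = myrank + 1000;             /* arbitrary payload */

        MPI_Irecv(&recvval, 1, MPI_INT, other, 0, MPI_COMM_WORLD, &rreq);
        MPI_Isend(&sendval, 1, MPI_INT, other, 0, MPI_COMM_WORLD, &sreq);

        /* ... computation that touches neither sendval nor recvval could go here ... */

        MPI_Wait(&sreq, &status);            /* do not reuse sendval before this returns */
        MPI_Wait(&rreq, &status);            /* recvval is valid only after this returns */

        printf("rank %d received %d\n", myrank, recvval);
    }

    MPI_Finalize();
    return 0;
}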