OpenACC Online Course: Lecture 1: Introduction to OpenACC
int main(){
   <sequential code>

   #pragma acc kernels   // Compiler Directive
   {
      <parallel code>
   }
}

Incremental ● Single Source ● Low Learning Curve

More on this later!
SINGLE CODE FOR MULTIPLE PLATFORMS
OpenACC - Performance Portable Programming Model for HPC
AWE Hydrodynamics CloverLeaf mini-App, bm32 data set
[Chart: PGI OpenACC CloverLeaf speedup (0x to 80x scale, peak 77x) across platforms: Dual Haswell, Dual Broadwell, Dual POWER8, PEZY-SC, 1 Tesla P100, 1 Tesla V100]
Systems: Haswell: 2x16-core Haswell server, four K80s, CentOS 7.2 (perf-hsw10); Broadwell: 2x20-core Broadwell server, eight P100s (dgx1-prd-01); Minsky: POWER8+NVLINK, four P100s, RHEL 7.3 (gsn1).
Compilers: Intel 17.0, IBM XL 13.1.3, PGI 16.10; KNL compiler version: 17.0.1 20161005.
Benchmark: CloverLeaf v1.3 downloaded from http://uk-mac.github.io/CloverLeaf the week of November 7, 2016; CloverLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC).
Data compiled by PGI November 2016; Volta data collected June 2017.
TOP HPC APPS ADOPTING OPENACC
ANSYS Fluent ● Gaussian ● VASP ● GTC ● XGC ● ACME ● FLASH ● LSDalton ● COSMO ● ELEPHANT ● RAMSES ● ICON ● ORB5
[Performance chart: y-axis 0 to 15000; x-axis CPU (cores) configurations T4, T8, T14, T28]
Hardware: HPE server with dual Intel Xeon E5-2698 v3 CPUs (2.30 GHz; 16 cores/chip), 256 GB memory and 4 Tesla K80 dual-GPU boards (boost clocks: MEM 2505 and SM 875).
CPU: (Haswell EP) Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30 GHz, 2 sockets, 28 cores. GPU: Tesla K80 12+12 GB, Driver 346.46.
Gaussian source code compiled with PGI Accelerator Compilers (16.5) with OpenACC (2.5 standard).
FAMILIAR TO OPENMP PROGRAMMERS
CPU:
main() {
   double pi = 0.0; long i;
   ...

CPU + Parallel Hardware:
main() {
   double pi = 0.0; long i;
   ...
Applications
Libraries | Compiler Directives | Programming Languages
• Single source. No GPU-specific code. Compile the same program for accelerators or serial.
• Incremental. Developers can port and tune parts of their application as resources and profiling dictate. No wholesale rewrite is required, and individual ports can be quick.
TRUE OPEN STANDARD
• Full OpenACC 1.0, 2.0, and now 2.5 specifications are available at OpenACC.org
(http://www.openacc.org)

[OpenACC.org member logos]
NekCEM (Comp Electromagnetics, Argonne National Lab): 2.5X speedup, 60% less energy
MAESTRO & CASTRO (Astrophysics, Stony Brook University): 4.4X speedup, 4 weeks effort
CloverLeaf (Comp Hydrodynamics, AWE): 4X speedup, single CPU/GPU code
FINE/Turbo (CFD, NUMECA International): 10X faster routines, 2X faster app
OPENACC DIRECTIVES
A SIMPLE EXAMPLE: SAXPY
SAXPY in C:
...
// Somewhere in main
// call SAXPY on 1M elements
saxpy(1<<20, 2.0, x, y);
...

SAXPY in Fortran:
...
! From main program
! call SAXPY on 1M elements
call saxpy(2**20, 2.0, x_d, y_d)
...
KERNELS: OUR FIRST OPENACC DIRECTIVE
We request that each loop execute as a separate kernel on the GPU. This is an
incredibly powerful directive.
!$acc kernels
do i=1,n
   a(i) = 0.0
   b(i) = 1.0          ! kernel 1
   c(i) = 2.0
end do

do i=1,n
   a(i) = b(i) + c(i)  ! kernel 2
end do
!$acc end kernels

Kernel: a parallel routine to run on the parallel hardware.
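The same two-kernel construct in C (a sketch not shown on this slide, assuming float arrays a, b, and c of length n):

#pragma acc kernels
{
    for (int i = 0; i < n; ++i) {
        a[i] = 0.0f;
        b[i] = 1.0f;          /* kernel 1 */
        c[i] = 2.0f;
    }

    for (int i = 0; i < n; ++i)
        a[i] = b[i] + c[i];   /* kernel 2 */
}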
GENERAL DIRECTIVE SYNTAX AND SCOPE
I may indent the directives at the natural code indentation level for readability. It is also a common practice to always start them in the first column (à la #define/#ifdef). Either is fine with C or Fortran 90 compilers.
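As a concrete sketch of the scoping (reusing the SAXPY loop from the earlier slide), in C the directive applies to the structured block that immediately follows it:

#pragma acc kernels   /* applies to the block below */
{
    for (int i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
}

In Fortran the region is delimited explicitly, with !$acc kernels before the block and !$acc end kernels after it, as in the kernels example a few slides back.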
COMPLETE SAXPY EXAMPLE CODE
#include <stdlib.h>

void saxpy(int n,
           float a,
           float *x,
           float *restrict y)   // "I promise y is not aliased by anything else (esp. x)"
{
#pragma acc kernels
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}

int main(int argc, char **argv)
{
  int N = 1<<20; // 1 million floats

  if (argc > 1)
    N = atoi(argv[1]);

  float *x = (float*)malloc(N * sizeof(float));
  float *y = (float*)malloc(N * sizeof(float));

  for (int i = 0; i < N; ++i) {
    x[i] = 2.0f;
    y[i] = 1.0f;
  }

  saxpy(N, 3.0f, x, y);

  return 0;
}
C DETAIL: THE “RESTRICT” KEYWORD
• Standard C (as of C99).
• Important for optimization of serial as well as OpenACC and OpenMP code.
• Promise given by the programmer to the compiler for a pointer: float *restrict ptr
Meaning: “for the lifetime of ptr, only it or a value directly derived from it (such as ptr + 1) will be used to access the object to
which it points”
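As a small illustration (a hypothetical function, not from the slides), restrict is what lets the compiler treat loop iterations as independent:

/* With restrict the compiler may assume src and dst never overlap,
   so the iterations below carry no hidden dependency and can be
   vectorized or parallelized safely. */
void scale(int n, float a, const float *restrict src, float *restrict dst)
{
    for (int i = 0; i < n; ++i)
        dst[i] = a * src[i];
}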
• Compile: pgcc -acc -Minfo=accel -ta=tesla saxpy.c
• Run: a.out

-ta=tesla will only target a GPU
-ta=multicore will only target a multicore CPU
-Minfo=accel turns on helpful compiler reporting

Compiler Output:
saxpy:
      8, Generating copyin(x[:n-1])
         Generating copy(y[:n-1])
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
      9, Loop is parallelizable
         Accelerator kernel generated
          9, #pragma acc loop worker, vector(256) /* blockIdx.x threadIdx.x */
             CC 1.0 : 4 registers; 52 shared, 4 constant, 0 local memory bytes; 100% occupancy
             CC 2.0 : 8 registers; 4 shared, 64 constant, 0 local memory bytes; 100% occupancy
COMPARE: OPENACC AND CUDA IMPLEMENTATIONS
OpenACC: Complete SAXPY Example Code

#include <stdlib.h>

void saxpy(int n,
           float a,
           float *x,
           float *restrict y)
{
#pragma acc kernels
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}

int main(int argc, char **argv)
{
  int N = 1<<20; // 1 million floats
  if (argc > 1)
    N = atoi(argv[1]);

  float *x = (float*)malloc(N * sizeof(float));
  float *y = (float*)malloc(N * sizeof(float));

  for (int i = 0; i < N; ++i) {
    x[i] = 2.0f;
    y[i] = 1.0f;
  }

  saxpy(N, 3.0f, x, y);
  return 0;
}

CUDA: Partial CUDA C SAXPY Code

__global__ void saxpy_kernel( float a, float* x, float* y, int n ){
  int i;
  i = blockIdx.x*blockDim.x + threadIdx.x;
  if( i <= n ) x[i] = a*x[i] + y[i];
}

void saxpy( float a, float* x, float* y, int n ){
  float *xd, *yd;
  cudaMalloc( (void**)&xd, n*sizeof(float) );
  cudaMalloc( (void**)&yd, n*sizeof(float) );
  cudaMemcpy( xd, x, n*sizeof(float), cudaMemcpyHostToDevice );
  cudaMemcpy( yd, y, n*sizeof(float), cudaMemcpyHostToDevice );
  saxpy_kernel<<< (n+31)/32, 32 >>>( a, xd, yd, n );
  cudaMemcpy( x, xd, n*sizeof(float), cudaMemcpyDeviceToHost );
  cudaFree( xd ); cudaFree( yd );
}

CUDA: Partial CUDA Fortran SAXPY Code

module kmod
  use cudafor
contains
  attributes(global) subroutine saxpy_kernel(A,X,Y,N)
    real(4), device :: A, X(N), Y(N)
    integer, value :: N
    integer :: i
    i = (blockidx%x-1)*blockdim%x + threadidx%x
    if( i <= N ) X(i) = A*X(i) + Y(i)
  end subroutine
end module

subroutine saxpy( A, X, Y, N )
  use kmod
  real(4) :: A, X(N), Y(N)
  integer :: N
  real(4), device, allocatable, dimension(:) :: Xd, Yd
  allocate( Xd(N), Yd(N) )
  Xd = X(1:N)
  Yd = Y(1:N)
  call saxpy_kernel<<<(N+31)/32,32>>>(A, Xd, Yd, N)
  X(1:N) = Xd
  deallocate( Xd, Yd )
end subroutine
BIG DIFFERENCE!
OpenACC vs CUDA implementations
for(index=0; index<1000000; index++)
   Array[index] = 4 * Array[index];

The compiler can split these independent iterations across the parallel hardware, for example:

....

Processor 4:
for(index=3000; index<3999; index++)
   Array[index] = 4*Array[index];

Processor 5:
for(index=4000; index<4999; index++)
   Array[index] = 4*Array[index];

....
WITH DATA DEPENDENCIES
But what if the loops are not entirely independent?
Take, for example, a similar loop like this:
for(index=1; index<1000000; index++)
   Array[index] = 4 * Array[index] - Array[index-1];

Processor 1:
for(index=1; index<999; index++)
   Array[index] = 4*Array[index] - Array[index-1];

Processor 2:
for(index=1000; index<1999; index++)
   Array[index] = 4*Array[index] - Array[index-1];

....

Now Processor 2 needs the value of Array[999] that Processor 1 is still computing, so the chunks can no longer run independently.

As large, complex loops are quite common in HPC, especially around the most important parts of your code, the compiler will often balk exactly when you most need a kernel to be generated. What can you do?
HOW TO MANAGE DATA DEPENDENCIES
• Rearrange your code to make it more obvious to the compiler that there is not
really a data dependency.
• Eliminate a real dependency by changing your code.
• There is a common bag of tricks developed for this as this issue goes back 40
years in HPC. Many are quite trivial to apply.
• The compilers have gradually been learning these themselves.
• Override the compiler's judgment (with the independent clause) at the risk of invalid results, as sketched below. Misuse of restrict has similar consequences.
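A minimal sketch of that last option (hypothetical arrays; the indirect idx[] addressing stands in for anything the compiler cannot analyze):

/* Assert that the iterations are independent even though the compiler
   cannot prove it; if the assertion is wrong (e.g. idx repeats a value),
   the results are invalid. */
#pragma acc kernels loop independent
for (int i = 0; i < n; ++i)
    a[idx[i]] = 2.0f * b[i];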
EXERCISES
FOUNDATION EXERCISE: LAPLACE SOLVER
• I’ve been using this for MPI, OpenMP and now OpenACC. It is a great simulation
problem, not rigged for OpenACC.
• In its most basic form, it solves the Laplace equation: $\nabla^2 f(x,y) = 0$
• The Laplace Equation applies to many physical problems, including: electrostatics,
fluid flow, and temperature
• For temperature, it is the Steady State Heat Equation:
[Figure: Initial Conditions — metal plate with a heating element; Final Steady State — metal plate]
EXERCISE FOUNDATION: JACOBI ITERATION
• The Laplace equation on a grid states that each grid point is the average of its neighbors.
• We can iteratively converge to that state by repeatedly computing new values at
each point from the average of neighboring points.
• We just keep doing this until the difference from one pass to the next is small
enough for us to tolerate.
$A_{k+1}(i,j) = \dfrac{A_k(i-1,j) + A_k(i+1,j) + A_k(i,j-1) + A_k(i,j+1)}{4}$

[Stencil: A(i,j) and its four neighbors A(i-1,j), A(i+1,j), A(i,j-1), A(i,j+1)]
SERIAL CODE IMPLEMENTATION
C
for(i = 1; i <= ROWS; i++) {
for(j = 1; j <= COLUMNS; j++) {
Temperature[i][j] = 0.25 * (Temperature_last[i+1][j] + Temperature_last[i-1][j] +
Temperature_last[i][j+1] + Temperature_last[i][j-1]);
}
}
Fortran
do j=1,columns
do i=1,rows
temperature(i,j)= 0.25 * (temperature_last(i+1,j)+temperature_last(i-1,j) + &
temperature_last(i,j+1)+temperature_last(i,j-1) )
enddo
enddo
SERIAL C CODE (KERNEL)
while ( dt > MAX_TEMP_ERROR && iteration <= max_iterations ) {    /* Done? */

    /* Calculate */
    for(i = 1; i <= ROWS; i++) {
        for(j = 1; j <= COLUMNS; j++) {
            Temperature[i][j] = 0.25 * (Temperature_last[i+1][j] + Temperature_last[i-1][j] +
                                        Temperature_last[i][j+1] + Temperature_last[i][j-1]);
        }
    }

    dt = 0.0;

    /* Update temp array and find max change */
    for(i = 1; i <= ROWS; i++){
        for(j = 1; j <= COLUMNS; j++){
            dt = fmax( fabs(Temperature[i][j]-Temperature_last[i][j]), dt);
            Temperature_last[i][j] = Temperature[i][j];
        }
    }

    iteration++;
}
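As a preview of the exercise (a minimal first-step sketch, not the tuned solution), one could simply wrap each pair of nested loops in a kernels region:

while ( dt > MAX_TEMP_ERROR && iteration <= max_iterations ) {

    #pragma acc kernels
    for(i = 1; i <= ROWS; i++) {
        for(j = 1; j <= COLUMNS; j++) {
            Temperature[i][j] = 0.25 * (Temperature_last[i+1][j] + Temperature_last[i-1][j] +
                                        Temperature_last[i][j+1] + Temperature_last[i][j-1]);
        }
    }

    dt = 0.0;

    #pragma acc kernels
    for(i = 1; i <= ROWS; i++){
        for(j = 1; j <= COLUMNS; j++){
            dt = fmax( fabs(Temperature[i][j]-Temperature_last[i][j]), dt);
            Temperature_last[i][j] = Temperature[i][j];
        }
    }

    iteration++;
}

This naive version works, but it copies the temperature arrays between host and device on every iteration; reducing that data movement is the usual next step.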
SERIAL C CODE SUBROUTINES
void initialize(){ ... }

void track_progress(int iteration) { ... }
WHOLE C CODE
#include <sys/time.h>
// size of plate
#define COLUMNS 1000
#define ROWS 1000
// largest permitted change in temp (This value takes about 3400 steps)
#define MAX_TEMP_ERROR 0.01
SERIAL FORTRAN CODE (KERNEL)

! Calculate
do j=1,columns
   do i=1,rows
      temperature(i,j) = 0.25*(temperature_last(i+1,j)+temperature_last(i-1,j)+ &
                               temperature_last(i,j+1)+temperature_last(i,j-1) )
   enddo
enddo

dt=0.0
...
iteration = iteration+1
enddo
SERIAL FORTRAN CODE SUBROUTINES
subroutine initialize( temperature_last )
   implicit none
   ...
   temperature_last = 0.0

   !these boundary conditions never change throughout run

   !set left side to 0 and right to linear increase
   do i=0,rows+1
      temperature_last(i,0) = 0.0
      temperature_last(i,columns+1) = (100.0/rows) * i
   enddo
   ...

subroutine track_progress(temperature, iteration)
   implicit none
   ...
   print *, '---------- Iteration number: ', iteration, ' ---------------'
   do i=5,0,-1
      write (*,'("("i4,",",i4,"):",f6.2," ")',advance='no'), &
            rows-i,columns-i,temperature(rows-i,columns-i)
   enddo
   print *
   ...
WHOLE FORTRAN CODE

!Size of plate
integer, parameter :: columns=1000
integer, parameter :: rows=1000
double precision, parameter :: max_temp_error=0.01
PROGRAMMING MODELS: OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran Compilers and Tools
PLATFORMS: X86, OpenPOWER, NVIDIA GPU
UPDATES: 1-2 times a year | 6-9 times a year | 6-9 times a year
SUPPORT: User Forums | PGI Support | PGI Professional Services
LICENSE: Annual | Perpetual | Volume/Site
OPENACC RESOURCES
Guides ● Talks ● Tutorials ● Videos ● Books ● Spec ● Code Samples ● Teaching Materials ● Events ● Success Stories ● Courses ● Slack ● Stack Overflow
FREE Compilers
https://www.openacc.org/community#slack
XSEDE MONTHLY WORKSHOP SERIES