GPU Computing with CUDA
Lecture 8 - CUDA Libraries - CUFFT, PyCUDA
Christopher Cooper
Boston University
August, 2011
UTFSM, Valparaíso, Chile
1
Outline of lecture
‣ Overview:
‣ Algorithm
‣ Motivation, examples
2
CUDA Libraries
‣ Fourier Transform
$$\hat{u}(k) = \int_{-\infty}^{\infty} e^{-ikx}\, u(x)\, dx \qquad \text{(real space} \rightarrow \text{wave space)}$$
4
Discrete Fourier Transform (DFT)
[Figure: plots of the basis functions sin(x), sin(3x), sin(5x), sin(7x)]
‣ DFT
$$\hat{u}_k = \sum_{j=0}^{N-1} u_j\, e^{-\frac{2\pi i}{N} kj}, \qquad k = 0, 1, \dots, N-1$$
‣ Inverse DFT
$$u_j = \sum_{k=0}^{N-1} \hat{u}_k\, e^{\frac{2\pi i}{N} kj}, \qquad j = 0, 1, \dots, N-1$$
6
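As a reference for what follows, here is a minimal CPU sketch (my illustration, not from the slides) that evaluates the DFT sum above directly; the double loop makes the O(N²) cost of the direct transform explicit.

#include <complex.h>
#include <math.h>

/* Direct O(N^2) evaluation of u_hat[k] = sum_j u[j] * exp(-2*pi*i*k*j/N) */
void dft(const double complex *u, double complex *u_hat, int N)
{
    for (int k = 0; k < N; k++)
    {
        u_hat[k] = 0.0;
        for (int j = 0; j < N; j++)
            u_hat[k] += u[j] * cexp(-2.0 * M_PI * I * k * j / (double)N);
    }
}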
Fast Fourier Transform (FFT)
‣ The FFT evaluates the DFT in O(N log N) operations instead of the O(N²) of the direct sum
- For N = 10⁴: roughly 10⁸ operations for the direct DFT vs. about 10⁵ for the FFT
7
Fast Fourier Transform (FFT)
‣ Cooley-Tukey radix 2
$$\hat{u}_k = \sum_{j=0}^{N-1} u_j\, e^{-\frac{2\pi i}{N} kj}$$
$$\hat{u}_k = \underbrace{\sum_{j=0}^{N/2-1} u_{2j}\, e^{-\frac{2\pi i}{N} k(2j)}}_{\text{Even}} + \underbrace{\sum_{j=0}^{N/2-1} u_{2j+1}\, e^{-\frac{2\pi i}{N} k(2j+1)}}_{\text{Odd}}$$
8
Fast Fourier Transform (FFT)
$$\hat{u}_k = \sum_{j=0}^{N/2-1} u_{2j}\, e^{-\frac{2\pi i}{N/2} kj} + e^{-\frac{2\pi i}{N} k} \sum_{j=0}^{N/2-1} u_{2j+1}\, e^{-\frac{2\pi i}{N/2} kj}$$
‣ By applying this splitting recursively until no sums remain, you get log(N) levels
‣ 4-point transform
$$\hat{u}_k = u_0 + u_1 e^{-\frac{2\pi}{4} ik} + u_2 e^{-\frac{2\pi}{4} i2k} + u_3 e^{-\frac{2\pi}{4} i3k}$$
$$\hat{u}_k = u_0 + u_2 e^{-\frac{2\pi}{4} i2k} + e^{-\frac{2\pi}{4} ik}\left(u_1 + u_3 e^{-\frac{2\pi}{4} i2k}\right)$$
$$\hat{u}_k = u_0 + u_2 e^{-\pi ik} + e^{-\frac{\pi}{2} ik}\left(u_1 + u_3 e^{-\pi ik}\right), \qquad k = 0, 1, 2, 3$$
9
Fast Fourier Transform (FFT)
$$\hat{u}_0 = u_0 + u_2 e^{0} + e^{0}\left(u_1 + u_3 e^{0}\right)$$
$$\hat{u}_1 = u_0 + u_2 e^{-\pi i} + e^{-\frac{\pi}{2} i}\left(u_1 + u_3 e^{-\pi i}\right)$$
$$\hat{u}_2 = u_0 + u_2 e^{-2\pi i} + e^{-\pi i}\left(u_1 + u_3 e^{-2\pi i}\right)$$
$$\hat{u}_3 = u_0 + u_2 e^{-3\pi i} + e^{-\frac{3\pi}{2} i}\left(u_1 + u_3 e^{-3\pi i}\right)$$
periodicity ⇒ $e^{0} = e^{-2\pi i} = 1$, $e^{-\pi i} = e^{-3\pi i} = -1$
[Figure: butterfly diagram combining inputs u_0, u_2, u_1, u_3 into outputs û_0, û_1, û_2, û_3]
10
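The even/odd splitting above can be coded as a short recursion. The sketch below is my illustration (not from the slides); it assumes N is a power of two, writes the result out of place, and combines each half with the twiddle factor e^{-2πik/N}.

#include <complex.h>
#include <math.h>

/* Recursive radix-2 Cooley-Tukey FFT: transform the even- and odd-indexed
   subsequences of length N/2, then recombine them. stride selects every
   stride-th element of the input; call with stride = 1 initially. */
void fft_radix2(const double complex *u, double complex *u_hat, int N, int stride)
{
    if (N == 1)
    {
        u_hat[0] = u[0];
        return;
    }
    fft_radix2(u,          u_hat,       N/2, 2*stride);   // even terms
    fft_radix2(u + stride, u_hat + N/2, N/2, 2*stride);   // odd terms

    for (int k = 0; k < N/2; k++)
    {
        double complex even = u_hat[k];
        double complex odd  = cexp(-2.0 * M_PI * I * k / (double)N) * u_hat[k + N/2];
        u_hat[k]       = even + odd;
        u_hat[k + N/2] = even - odd;
    }
}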
FFT - Motivation
‣ Signal processing
‣ Convolution, filters
11
FFT - Motivation
$$u_j = \sum_{k=0}^{N-1} \hat{u}_k\, e^{ikx_j}$$
$$\frac{\partial u_j}{\partial x} = \sum_{k=0}^{N-1} ik\, \hat{u}_k\, e^{ikx_j} \qquad \Rightarrow \qquad \widehat{\frac{\partial u}{\partial x}} = ik\, \hat{u}$$
$$\frac{\partial^2 u_j}{\partial x^2} = \sum_{k=0}^{N-1} -k^2\, \hat{u}_k\, e^{ikx_j}$$
12
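On the GPU this is just an element-wise scaling of the spectral coefficients. A minimal kernel sketch (my illustration, assuming a cufftComplex array of N coefficients and a precomputed wavenumber array k, as used in the Poisson example later):

#include <cufft.h>

// Multiply each Fourier coefficient by i*k:
// (a + i*b) * (i*k) = -k*b + i*(k*a)
__global__ void spectral_derivative(const cufftComplex *u_hat,
                                    cufftComplex *du_hat,
                                    const float *k, int N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        du_hat[i].x = -k[i] * u_hat[i].y;
        du_hat[i].y =  k[i] * u_hat[i].x;
    }
}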
FFT - Motivation
‣ Advantages
- Spectral accuracy: the error decays as $O(c^N)$, $0 < c < 1$
‣ Limitations
- Grid constraints
13
CUFFT
‣ Supported by NVIDIA
‣ Features:
‣ cufftHandle
‣ cufftResult
‣ cufftReal
‣ cufftDoubleReal
‣ cufftComplex
‣ cufftDoubleComplex
15
CUFFT - Transform types
‣ Single precision: CUFFT_R2C (real to complex), CUFFT_C2R (complex to real), CUFFT_C2C (complex to complex)
‣ Double precision: CUFFT_D2Z, CUFFT_Z2D, CUFFT_Z2Z
16
CUFFT - Plans
‣ cufftPlan1d()
‣ cufftPlan2d()
‣ cufftPlan3d()
‣ cufftPlanMany()
17
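cufftPlanMany() is the most general of these. A hedged sketch based on the CUFFT documentation (this call is not shown in the lecture) for planning a batch of 1D complex transforms:

cufftHandle plan;
int n[1] = {256};                   // length of each 1D transform

// 10 back-to-back transforms of length 256, tightly packed in memory;
// NULL embed pointers select the default (contiguous) data layout.
cufftPlanMany(&plan, 1, n,
              NULL, 1, 256,         // input: stride 1, distance 256 between signals
              NULL, 1, 256,         // output: same layout
              CUFFT_C2C, 10);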
CUFFT - Functions
‣ cufftExecC2C(), cufftExecR2C(), cufftExecC2R() (and the double precision variants cufftExecZ2Z(), cufftExecD2Z(), cufftExecZ2D())
‣ cufftDestroy()
18
CUFFT - Performance considerations
‣ Performance recommendations
19
CUFFT - Performance considerations
‣ CUFFT vs FFTW
http://www.sharcnet.ca/~merz/CUDA_benchFFT/
20
CUFFT - Example
#include <cufft.h>

#define NX 256
#define BATCH 10

cufftHandle plan;
cufftComplex *data;
cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*BATCH);

/* Create a 1D FFT plan. */
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);

/* Use the CUFFT plan to transform the signal in place. */
cufftExecC2C(plan, data, data, CUFFT_FORWARD);

/* Destroy the CUFFT plan. */
cufftDestroy(plan);
cudaFree(data);
21
CUFFT - Example
$$\nabla^2 u = \frac{r^2 - 2\sigma^2}{\sigma^4}\, e^{-\frac{r^2}{2\sigma^2}}$$
$$u_{an} = e^{-\frac{r^2}{2\sigma^2}}, \qquad r = \sqrt{(x - 0.5)^2 + (y - 0.5)^2}$$
[Figure: surface plot of the analytical solution u(x, y) on the unit square]
22
CUFFT - Example
‣ Steps
$$\nabla^2 u = f$$
FFT the system: $-k^2 \hat{u} = \hat{f}$
Solve for $\hat{u}$: $\hat{u} = -\dfrac{\hat{f}}{k^2}$
Transform back: $u = \mathrm{ifft}\left(-\dfrac{\hat{f}}{k^2}\right)$
with $k^2 = k_x^2 + k_y^2$
23
CUFFT - Example
int main()
{
    int N = 64;
    float xmax = 1.0f, xmin = 0.0f, ymin = 0.0f,
          h = (xmax - xmin)/((float)N), s = 0.1, s2 = s*s;
    float *x = new float[N*N], *y = new float[N*N], *u = new float[N*N],
          *f = new float[N*N], *u_a = new float[N*N], *err = new float[N*N];
    float r2;

    for (int j=0; j<N; j++)
        for (int i=0; i<N; i++)
        {
            x[N*j+i] = xmin + i*h;
            y[N*j+i] = ymin + j*h;
            r2 = (x[N*j+i]-0.5)*(x[N*j+i]-0.5) + (y[N*j+i]-0.5)*(y[N*j+i]-0.5);
            f[N*j+i] = (r2-2*s2)/(s2*s2)*exp(-r2/(2*s2));
            u_a[N*j+i] = exp(-r2/(2*s2));   // analytical solution
        }

    float *k = new float[N];
    for (int i=0; i<=N/2; i++)
    {
        k[i] = i * 2*M_PI;
    }
    for (int i=N/2+1; i<N; i++)
    {
        k[i] = (i - N) * 2*M_PI;
    }
24
CUFFT - Example
    // Allocate arrays on the device
    float *k_d, *f_d, *u_d;
    cudaMalloc((void**)&k_d, sizeof(float)*N);
    cudaMalloc((void**)&f_d, sizeof(float)*N*N);
    cudaMalloc((void**)&u_d, sizeof(float)*N*N);

    cudaMemcpy(k_d, k, sizeof(float)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(f_d, f, sizeof(float)*N*N, cudaMemcpyHostToDevice);

    cufftComplex *ft_d, *f_dc, *ft_d_k, *u_dc;
    cudaMalloc((void**)&ft_d, sizeof(cufftComplex)*N*N);
    cudaMalloc((void**)&ft_d_k, sizeof(cufftComplex)*N*N);
    cudaMalloc((void**)&f_dc, sizeof(cufftComplex)*N*N);
    cudaMalloc((void**)&u_dc, sizeof(cufftComplex)*N*N);

    dim3 dimGrid(int((N-0.5)/BSZ) + 1, int((N-0.5)/BSZ) + 1);
    dim3 dimBlock(BSZ, BSZ);
    real2complex<<<dimGrid, dimBlock>>>(f_d, f_dc, N);

    cufftHandle plan;
    cufftPlan2d(&plan, N, N, CUFFT_C2C);
25
CUFFT - Example
cufftExecC2C(plan, f_dc, ft_d, CUFFT_FORWARD);
solve_poisson<<<dimGrid, dimBlock>>>(ft_d, ft_d_k, k_d, N);
cufftExecC2C(plan, ft_d_k, u_dc, CUFFT_INVERSE);
complex2real<<<dimGrid, dimBlock>>>(u_dc, u_d, N);
cudaMemcpy(u, u_d, sizeof(float)*N*N, cudaMemcpyDeviceToHost);
    float constant = u[0];
    for (int i=0; i<N*N; i++)
    {
        u[i] -= constant;   // subtract u[0] to force the arbitrary constant to be 0
    }
26
CUFFT - Example
__global__ void solve_poisson(cufftComplex *ft, cufftComplex *ft_k, float *k, int N)
{
    int i = threadIdx.x + blockIdx.x*BSZ;
    int j = threadIdx.y + blockIdx.y*BSZ;
    int index = j*N+i;
    if (i<N && j<N)
    {
        float k2 = k[i]*k[i] + k[j]*k[j];
        if (i==0 && j==0) {k2 = 1.0f;}
        ft_k[index].x = -ft[index].x/k2;
        ft_k[index].y = -ft[index].y/k2;
    }
}
27
CUFFT - Example
__global__ void real2complex(float *f, cufftComplex *fc, int N)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;
    int j = threadIdx.y + blockIdx.y*blockDim.y;
    int index = j*N+i;
    if (i<N && j<N)
    {
        fc[index].x = f[index];
        fc[index].y = 0.0f;
    }
}

__global__ void complex2real(cufftComplex *fc, float *f, int N)
{
    int i = threadIdx.x + blockIdx.x*BSZ;
    int j = threadIdx.y + blockIdx.y*BSZ;
    int index = j*N+i;
    if (i<N && j<N)
    {
        f[index] = fc[index].x/((float)N*(float)N);   // divide by number of elements to recover value
    }
}
28
PyCUDA
‣ http://mathema.tician.de/software/pycuda
29
PyCUDA
‣ Scripting language
‣ PyCUDA
30
PyCUDA
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
    const int i = threadIdx.x;
    dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)

multiply_them(
    drv.Out(dest), drv.In(a), drv.In(b),
    block=(400,1,1), grid=(1,1))

print dest - a*b
31
PyCUDA
‣ Transferring data
import numpy
import pycuda.autoinit
import pycuda.driver as cuda

# a is assumed to exist already as a numpy array (e.g. a = numpy.random.randn(4,4))
a = a.astype(numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)
‣ Executing a kernel
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void doublify(float *a)
{
    int idx = threadIdx.x + threadIdx.y*4;
    a[idx] *= 2;
}
""")

...
# Allocate, generate and transfer
func = mod.get_function("doublify")
func(a_gpu, block=(4,4,1))

a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print a_doubled
print a
32