CUDA Threads and Block Scheduling
CUDA Threads and Block Scheduling
Terminology
• Streaming Multiprocessor (SM)
CUDA Threads
• Streaming Processors (SP)
Bedřich Beneš, Ph.D. also called CUDA Cores
Purdue University
Department of Computer Graphics
• An SP processes threads
belonging to a block (shared resources)
© Bedrich Benes
Terminology How it works
1) Grid is launched
2) Blocks are assigned to streaming
multiprocessors (SM) on block-by-block
basis in arbitrary order (scalability)
(Each SM can process more blocks)
e.g., GT200 can do max 8 blocks or
max 1024 threads per SM
© Bedrich Benes © Bedrich Benes
1
10/16/2011
How it works Basic Considerations
3) An assigned block is partitioned into warps. • the size of a block is limited to 512 threads
Their execution is interleaved blockDim(512,1,1)
blockDim(8,16,2)
blockDim(16,16,2)
4) Warps are assigned to SM
(one thread to one SP)
• kernel can handle up to
65,536x65,536 blocks
5) Warps can be delayed if idle for some
reason (waiting for memory)
© Bedrich Benes © Bedrich Benes
G80 Architecture GT200 Architecture
has 16 SMs has 30 SMs
each can process each can process
8 blocks 8 blocks
or or
768 threads 1024 threads
max: 8x16=128 CUDA Cores (SPs) max: 8x30=240 CUDA Cores (SPs)
max: 16x768=12,288 threads max: 30x1,024=30,720 threads
© Bedrich Benes © Bedrich Benes
2
10/16/2011
GT200 Architecture GT200 Architecture
30,720 threads max
240 CUDA cores
One SM limits:
1024 threads = 4x256 or
8x128 etc.
One block limits:
512 threads = 2x256 or
8x64 etc.
Image © Nvidia
© Bedrich Benes © Bedrich Benes
Image © Nvidia
GT400 (Fermi) Block Assignment
has 16 SM • if more than the max amount of
each can process blocks are assigned to SM
8 blocks they will be scheduled for later execution
1 SM has 32 cuda cores
total: 512 cuda cores • Is it good or bad?
Well, it depends, but usually good.
plus 16kb or 48kb L1 Caches per SM
can run two different warps per kernel
(dual warp scheduler)
© Bedrich Benes © Bedrich Benes
3
10/16/2011
Comparison Warps
• A thread block is divided into warps
G80 GT200 GT500
Transistors 681 millions 1.4 billions 3.0 billions • A block of 32 threads
CUDA Cores 128 240 512 (hw dependent and can change)
Warp scheduler 1 1 2
per SM • Warps are scheduling units of SM
Shared Memory 16KB 16KB 16 or 48KB
per SM • warp0: t0,t1,…,t31
L1 cache per SM None None 16 or 48KB warp1: t32,t33,…,t63
L2 cache per SM None None 768 KB
Load/store 32b 32b 64b
address width
© Bedrich Benes © Bedrich Benes
Warps Warps
• Example: • Example2:
3 blocks assigned to SM, How many warps in the GT200?
each with 128 threads.
How many warps we have in the SM? • 1024 threads/32 (warp length)=32 warps
• 128 threads/32 (warp length)=4 warps
• 4 (warps) x 3 (blocks) = 12
warps at the same time
© Bedrich Benes © Bedrich Benes
4
10/16/2011
5
10/16/2011
6
10/16/2011
© Bedrich Benes © Bedrich Benes
Warps/block alignment Warps/block alignment
• 1D Case • 2D Case t0,0 t1,0 t2,0 t3,0 t4,0 t5,0 t6,0 t7,0 t8,0
t0,1 t1,1 t2,1 t3,1 t4,1 t5,1 t6,1 t7,1 t8,1
block of 100 threads – how many warps? blockDim(9,9)
t0,2 t1,2 t2,2 t3,2 t4,2 t5,2 t6,2 t7,2 t8,2
100/32 = 3 1/8 (3 full warps + 4 threads) 81 threads t0,3 t1,3 t2,3 t3,3 t4,3 t5,3 t6,3 t7,3 t8,3
t0 t1 … t31 t32 t33 … t63 t64 t65 … t92 t93 t94 t95 t96 t97 t98 t99
81/32 = 2 warps t0,4 t1,4 t2,4 t3,4 t4,4 t5,4 t6,4 t7,4 t8,4
t0,5 t1,5 t2,5 t3,5 t4,5 t5,5 t6,5 t7,5 t8,5
and 17 threads t0,6 t1,6 t2,6 t3,6 t4,6 t5,6 t6,6 t7,6 t8,6
w0 w1 w2 1/8 of w3 t0,7 t1,7 t2,7 t3,7 t4,7 t5,7 t6,7 t7,7 t8,7
M M
t0,8 t1,8 t2,8 t!3,8 t4,8 t5,8 t6,8 t7,8 t8,8
%
• the last warp will be occupied entirely, but
t0,0 t1,0 … t4,3 t5,3 t6,3 … t0,7 t64 t65 t8,8
only 4 threads (t96–t99 of the 100-thread block) will have meaning …
7
10/16/2011
Warps/block alignment Warp execution
• 3D Case t0,0 t1,0 t2,0 t3,0,4
t0,0 t1,0 t2,0 t3,0,3
• SIMT – single instruction, multiple threads
t0,1 t t1,1t t2,1 t3,1,4
blockDim(4,4,5)
t0,0 t1,0
t0,0 t1,0t0,1 t t1,12,0 t2,13,0,2
2,0 tt1,2
t0,2
t
t2,23,1,3
t3,2,4
the same instruction is broadcast to all
t t t2,13,0,1
t3,1,2
t0,0,0 t1,0,0tt0,2
2,0,0tt1,2
threads and executed at the same time in
0,1 1,1
3,0,0t2,2 t3,2,3
80 threads t0,1 t1,1 t0,3
t0,2 t1,2t0,3
t2,1 tt1,3
2,1,0t1,3
t 3,1,0
3,1,1t2,3 t3,3,4
t0,2t t1,2t t2,2 t3,2,2
t0,1,0 t1,1,0 t2,3 t3,3,3
t3,2,1
80/32 = 2 warps t0,3t t1,32,2
t0,2,0 t1,2,0 t t2,3 t3,3,2
t0,3 t1,32,2,0t2,33,2,0
t3,3,1
the SM.
t0,3,0 t1,3,0 t2,3,0 t3,3,0
and 16 threads
• All SPs in the SM
t0,0,0 t1,0,0 … t3,3,1 t0,0,2 t1,0,2 … t3,3,3 t0,0,4 t1,0,4 … t3,3,4
execute the same instruction.
w0 (32) w1 (32) w2 (16)
© Bedrich Benes © Bedrich Benes
Thread Divergence Thread Divergence
• How can all threads execute the same • The compiler will unroll both branches
instruction if we have the “if” command? and the GPU will perform both branches.
Example: Then in the first pass, else in the second.
if (threadIdx.x<10) • But not all ifs cause thread divergence!
{a[0]=10;}
else {a[1]=10;} a=tex2D(tex,u,v);
if (a<0.5)
Threads [0-9] will do “then” {a[0]=10;}
else {a[1]=10;}
the others will do “else”
This is called thread divergence
© Bedrich Benes © Bedrich Benes
8
10/16/2011
Thread Divergence Thread Divergence
• What causes thread divergence? Example:
for (int i=0;i<threadIdx.x;i++)
a[i]=i;
1) If statements with functions of threadIdx All loops that should finish will finish, but
2) Loops with functions of threadIdx
ifs are expensive anyway… the GPU will iterate for the others till the end
© Bedrich Benes © Bedrich Benes
Reading
• NVIDIA CUDA Programming Guide
• Kirk, D.B., Hwu, W.W.,
Programming Massively
Parallel Processors,
NVIDIA,
Morgan Kaufmann 2010
© Bedrich Benes