Modern Computer Architecture and Programming in Assembly Language - TCM - 183 - 1309076
Modern Computer Architecture and Programming in Assembly Language - TCM - 183 - 1309076
• Computer Systems:
A Programmer's
Perspective, 2/E
(CS:APP2e)
Carnegie Mellon
University
Course organization
• Online lectures
– http://asmcourse.cs.msu.ru/
• Online workshops
– http://algcourse.cs.msu.su/teachwiki/
• Online labs
– http://earth.ispras.ru
Agenda
void f() {
static int cntr = 0; // 1
int x = 2, y = 1, z = 0; // 2
unsigned short w = 282; // 3
signed char q = 13; // 4
++cntr; // 5
z = -x + q * w *y - w; // 6
}
Data retrieval
int x = 2, y = 1, z = 0;
unsigned short w = 282;
signed char q = 13;
x = 2;
y = 1;
z = 0;
++cntr;
z = -x + q * w *y - w;
Data transfer
Positive Negative
overflow overflow
x + y x - y
Signed overflow diagram
Positive
overflow
x - y
Negative
overflow
Negative
overflow
x + y
Positive
overflow
Arithmetic instructions: flags
OF SF ZF PF CF
ADD M M M M M
SUB M M M M M
ADC M M M M TM
SBB M M M M TM
IMUL M - - - M
IDIV - - - - -
NEG M M M M M
array layout
Push onto stack
Pop off stack
Stack frame
%include 'io.inc'
int main() { section .text
int a = 1, b = 2, c;
c = sum(a, b); global CMAIN
return 0; CMAIN:
} mov DWORD [ebp-16],0x1 ; (1)
mov DWORD [ebp-12],0x2 ; (2)
int sum(int x, int y) { mov eax,DWORD [ebp-12] ; (3)
int t = x + y; mov DWORD [esp+4],eax ; (4)
return t; mov eax,DWORD [ebp-16] ; (5)
} mov DWORD [esp],eax ; (6)
call sum ; (7)
mov DWORD [ebp-8],eax ; (8)
; int sum(int x, int y) -- returns x + y in eax.
; Arguments are read from the stack relative to the frame pointer:
; x at [ebp+8], y at [ebp+12].
global sum
sum:
push ebp ; (9) save the caller's frame pointer
mov ebp,esp ; (10) establish this function's frame
sub esp,0x10 ; (11) reserve 16 bytes below the frame pointer
mov edx,DWORD [ebp+12] ; (12) edx = y
mov eax,DWORD [ebp+8] ; (13) eax = x
add eax,edx ; (14) eax = x + y
mov DWORD [ebp-4],eax ; (15) store the sum in the local slot at [ebp-4] (t)
mov eax,DWORD [ebp-4] ; (16) reload t as the return value
mov esp, ebp ; (17) discard the reserved local space
pop ebp ; (18) restore the caller's frame pointer
ret ; (19)
64-bit addition
long long f1(long long a, long long b) {
long long c;
c = a + b;
return c;
}
; …
mov eax, DWORD [ebp+16] ; (1)
mov edx, DWORD [ebp+20] ; (2)
add eax, DWORD [ebp+8] ; (3)
adc edx, DWORD [ebp+12] ; (4)
; …
64-bit addition
64-bit addition: data flow
64-bit subtraction
; …
mov eax, DWORD [ebp+8] ; (1)
mov edx, DWORD [ebp+12] ; (2)
sub eax, DWORD [ebp+16] ; (3)
sbb edx, DWORD [ebp+20] ; (4)
; …
64-bit subtraction: data flow
long long f2(long long a, global f2
long long b) { f2:
long long c; push ebp
c = a * b; mov ebp, esp
return c; sub esp, 8
} mov DWORD [esp], ebx ; (1)
mov ecx, DWORD [ebp+20] ; (2)
mov ebx, DWORD [ebp+8] ; (3)
mov DWORD [esp+4], esi ; (4)
mov eax, DWORD [ebp+12] ; (5)
mov esi, DWORD [ebp+16] ; (6)
imul ecx, ebx ; (7)
imul eax, esi ; (8)
add ecx, eax ; (9)
mov eax, esi ;(10)
mul ebx ;(11)
mov ebx, DWORD [esp] ;(12)
lea esi, [ecx+edx] ;(13)
mov edx, esi ;(14)
mov esi, DWORD [esp+4] ;(15)
mov esp, ebp
pop ebp
ret
64-bit multiplication
64-bit multiplication: data flow
Contest #1: expression evaluation
• 7 word problems
• Solve 5 problems for grade «excellent»
• Submit via e-judge:
http://earth.ispras.ru/cgi-bin/new-client?contest_id=150&locale_id=0
• Sample problem
– «Watch out for overflow»
Contest #1: «Watch out for overflow»
&(r->a[i]);
Structure field access
unsigned copy(unsigned u) {
return u;
}
Data Alignment
typedef struct {
int i;
char c;
int j;
} trifield1; // (2)
typedef struct {
int i;
int j;
char c;
} trifield2; // (3)
Logical Instructions
; sha256_f1 -- returns ror(x, 13) ^ ror(x, 2) ^ ror(x, 22) in eax,
; where x is the dword read from [ebp+8].
; NOTE(review): the rotation amounts 2/13/22 look like SHA-256's
; "big sigma 0" function -- confirm against the caller.
; Overwrites eax, ecx, edx and the flags.
global sha256_f1
sha256_f1:
push ebp
mov ebp, esp
mov edx, DWORD [ebp+8] ; (1) edx = x
pop ebp ; (2) ebp is not used again, so it is restored early
mov eax, edx ; (3) eax = x
mov ecx, edx ; (4) ecx = x
ror eax, 13 ; (5) eax = ror(x, 13)
ror ecx, 2 ; (6) ecx = ror(x, 2)
xor eax, ecx ; (7) eax = ror(x, 13) ^ ror(x, 2)
ror edx, 22 ; (8) edx = ror(x, 22)
xor eax, edx ; (9) eax ^= ror(x, 22)
ret
Special arithmetic
int arith(int x,
int y, ; …
int z) { mov eax, dword [ebp + 16] ; (1)
int t1 = x + y; lea eax, [eax + 2 * eax] ; (2)
int t2 = z * 48; sal eax, 4 ; (3)
int t3 = t1 & 0xFFFF; mov edx, dword [ebp + 12] ; (4)
int t4 = t2 * t3; add edx, dword [ebp + 8] ; (5)
return t4; and edx, 65535 ; (6)
} imul eax, edx ; (7)
; …
Bit field
JE ZF Equal / Zero
JNE ~ZF Not Equal / Not Zero
JS SF Negative
JNS ~SF Non-negative
JG ~(SF^OF)&~ZF Greater (signed)
JGE ~(SF^OF) Greater or Equal (signed)
JL (SF^OF) Less (signed)
JLE (SF^OF)|ZF Less or Equal (signed)
JA ~CF&~ZF Above (unsigned)
JB CF Below (unsigned)
int absdiff(int x, int y) { absdiff:
int result; push ebp
if (x > y) { mov ebp, esp
result = x-y; mov edx, dword [8 + ebp] ; (1)
} else { mov eax, dword [12 + ebp] ; (2)
result = y-x; cmp edx, eax ; (3)
} jle .L6 ; (4)
return result; sub edx, eax ; (5)
} mov eax, edx ; (6)
jmp .L7 ; (7)
.L6: ; (8)
sub eax, edx ; (9)
.L7: ; (10)
pop ebp
ret
int goto_ad(int x, int y) { absdiff:
int result; push ebp
if (x <= y) goto Else; mov ebp, esp
result = x-y; mov edx, dword [8 + ebp] ; (1)
goto Exit; mov eax, dword [12 + ebp] ; (2)
Else: cmp edx, eax ; (3)
result = y-x; jle .L6 ; (4)
Exit: sub edx, eax ; (5)
return result; mov eax, edx ; (6)
} jmp .L7 ; (7)
.L6: ; (8)
sub eax, edx ; (9)
.L7: ; (10)
pop ebp
ret
val = Test ? Then_Expr : Else_Expr;
x loaded in edi
y loaded in esi
; absdiff -- conditional-move version with no branches.
; Inputs: edi = x, esi = y.
; Result in eax: x-y when x > y (signed compare), otherwise y-x.
; Both differences are computed up front; cmovg then picks one.
absdiff:
mov edx, edi
sub edx, esi ; tmp_val:edx = x-y
mov eax, esi
sub eax, edi ; result:eax = y-x
cmp edi, esi ; Compare x:y
cmovg eax, edx ; If >, result:eax = tmp_val:edx
ret
int pcount_do(unsigned x) { int pcount_do(unsigned x)
int result = 0; {
do { int result = 0;
result += x & 0x1; loop:
x >>= 1; result += x & 0x1;
} while (x); x >>= 1;
return result; if (x)
} goto loop;
return result;
}
int pcount_do(unsigned x) mov ecx, 0 ; result = 0
{ .L2: ; loop:
int result = 0; mov eax, edx
loop: and eax, 1 ; t = x & 1
result += x & 0x1; add ecx, eax ; result += t
x >>= 1; shr edx, 1 ; x >>= 1
if (x) jne .L2 ; If !0, goto loop
goto loop;
return result;
}
• Register allocation:
edx x
ecx result
int pcount_while(unsigned x) { int pcount_do(unsigned x) {
int result = 0; int result = 0;
while (x) { if (!x) goto done;
result += x & 0x1; loop:
x >>= 1; result += x & 0x1;
} x >>= 1;
return result; if (x)
} goto loop;
done:
return result;
}
int pcount_do(unsigned x) {
int result = 0;
loop:
if (!x) goto done;
result += x & 0x1;
x >>= 1;
goto loop;
done:
return result;
}
/* Number of bits in an int. */
#define WSIZE 8*sizeof(int)

/*
 * Count the 1-bits in x by testing every bit position in turn.
 * Returns a value in the range 0..WSIZE.
 */
int pcount_for(unsigned x) {
    int i;
    int result = 0;
    for (i = 0; i < WSIZE; i++) {
        /* 1u keeps the shift unsigned: with a plain int constant,
           1 << 31 overflows a signed int, which is undefined behaviour. */
        unsigned mask = 1u << i;
        result += (x & mask) != 0;
    }
    return result;
}
#define WSIZE 8*sizeof(int) int pcount_for_gt(unsigned x) {
int i;
int pcount_for(unsigned x) { int result = 0;
int i; i = 0;
int result = 0; if (!(i < WSIZE))
for (i = 0; i < WSIZE; i++) { goto done;
unsigned mask = 1 << i; loop:
result += (x & mask) != 0; {
} unsigned mask = 1 << i;
return result; result += (x & mask) != 0;
} }
i++;
if (i < WSIZE)
goto loop;
done:
return result;
}
#define WSIZE 8*sizeof(int) int pcount_for_gt(unsigned x) {
int i;
int pcount_for(unsigned x) { int result = 0;
int i; i = 0;
int result = 0; if (!(i < WSIZE))
for (i = 0; i < WSIZE; i++) { goto done;
unsigned mask = 1 << i; loop:
result += (x & mask) != 0; {
} unsigned mask = 1 << i;
return result; result += (x & mask) != 0;
} }
i++;
if (i < WSIZE)
goto loop;
done:
return result;
}
int fib(int x) { // x >= 1 fib:
int i; push ebp
int predpred = 0; mov ebp, esp
int pred = 1; push ebx
int res = 1;
x--; mov ecx, dword [ebp + 8] ; x
for (i = 0; i < x; i++) { xor edx, edx ; predpred
res = predpred + pred; mov ebx, 1 ; pred
predpred = pred; mov eax, 1 ; res
pred = res; dec ecx
}
return res; jecxz .end
} .loop:
lea eax, [edx + ebx]
mov edx, ebx
mov ebx, eax
loop .loop
.end:
pop ebx
pop ebp
ret
int fib(int x) { // x >= 1 fib:
int i; push ebp
int predpred = 0; mov ebp, esp
int pred = 1; push ebx
int res = 1;
x--; mov ecx, dword [ebp + 8] ; x
for (i = 0; i < x; i++) { xor edx, edx ; predpred
res = predpred + pred; mov ebx, 1 ; pred
predpred = pred; mov eax, 1 ; res
pred = res; dec ecx
}
return res; jecxz .end
} .loop:
lea eax, [edx + ebx]
mov edx, ebx
mov ebx, eax
loop .loop
.end:
pop ebx
pop ebp
ret
int fib(int x) { // x >= 1 fib:
int i; push ebp
int predpred = 0; mov ebp, esp
int pred = 1; push ebx
int res = 1;
x--; mov ecx, dword [ebp + 8] ; x
for (i = 0; i < x; i++) { xor edx, edx ; predpred
res = predpred + pred; mov ebx, 1 ; pred
predpred = pred; mov eax, 1 ; res
pred = res; dec ecx
}
return res; jecxz .end
} .loop:
lea eax, [edx + ebx]
mov edx, ebx
mov ebx, eax
loop .loop
.end:
pop ebx
pop ebp
ret
int fib(int x) { // x >= 1 fib:
int i; push ebp
int predpred = 0; mov ebp, esp
int pred = 1; push ebx
int res = 1;
x--; mov ecx, dword [ebp + 8] ; x
for (i = 0; i < x; i++) { xor edx, edx ; predpred
res = predpred + pred; mov ebx, 1 ; pred
predpred = pred; mov eax, 1 ; res
pred = res; dec ecx
}
return res; jecxz .end
} .loop:
lea eax, [edx + ebx]
mov edx, ebx
mov ebx, eax
loop .loop
.end:
pop ebx
pop ebp
ret
•Integer values
– Stored and processed in general purpose registers
– Signed/unsigned values
Intel ASM Bytes C
byte b 1 [unsigned] char
word w 2 [unsigned] short
double word d 4 [unsigned] int
quad word q 8 [unsigned] long long int
•Floating-point values
– Stored and processed in special floating-point registers
Intel ASM Bytes C
Single d 4 float
Double q 8 double
• Arrays — layout in memory
T A[L];
– Array of elements of type T, array length is L
– Stored in a contiguous memory block of size L *
sizeof(T) bytes
char string[12];
x x + 12
int val[5];
x x+4 x+8 x + 12 x + 16 x + 20
double a[3];
x x+8 x + 16 x + 24
char *p[3];
x x+4 x+8 x + 12
•Array element access
T A[L];
– Array of elements of type T, array length is L
– The identifier A can be used as a pointer to element 0. Pointer type is T*
int val[5]; 1 5 2 1 3
x x+4 x+8 x + 12 x + 16 x + 20
• Reference Type Value
val[4] int 3
val int * x
val+1 int * x+4
&val[2] int * x+8
val[5] int ??
*(val+1) int 5
val + i int * x+4i
#define ZLEN 5
typedef int zip_dig[ZLEN];
zip_dig cmu = { 1, 5, 2, 1, 3 };
zip_dig mit = { 0, 2, 1, 3, 9 };
zip_dig ucb = { 9, 4, 7, 2, 0 };
zip_dig cmu; 1 5 2 1 3
16 20 24 28 32 36
zip_dig mit; 0 2 1 3 9
36 40 44 48 52 56
zip_dig ucb; 9 4 7 2 0
56 60 64 68 72 76
; edx = z
; eax = dig Element address is
mov eax, dword [edx+4*eax] ; z[dig] edx + 4 * eax
void zincr(zip_dig z) {
int i;
for (i = 0; i < ZLEN; i++)
z[i]++;
}
; edx = z
mov eax, 0 ; eax = i
.L4: ; loop:
add dword [edx + 4 * eax], 1 ; z[i]++
add eax, 1 ; i++
cmp eax, 5 ; i vs. 5
jne .L4 ; if (!=) goto loop
void zincr_p(zip_dig z) { void zincr_v(zip_dig z) {
int *zend = z+ZLEN; void *vz = z;
do { int i = 0;
(*z)++; do {
z++; (*((int *) (vz+i)))++;
} while (z != zend); i += ISIZE;
} } while (i != ISIZE*ZLEN);
}
; Loop of zincr_v: add 1 to each dword of the array at edx,
; stepping a byte offset instead of an element index.
; edx = z = vz
; eax = i, a byte offset that advances by 4 until it reaches 20
; (presumably ISIZE and ISIZE*ZLEN from the C version -- confirm ISIZE == 4).
mov eax, 0 ; i = 0
.L8: ; loop:
add dword [edx + eax], 1 ; Increment vz+i
add eax, 4 ; i += 4
cmp eax, 20 ; i vs. 20
jne .L8 ; if (!=) goto loop
#define PCOUNT 4
zip_dig pgh[PCOUNT] =
{{1, 5, 2, 0, 6},
{1, 5, 2, 1, 3 },
{1, 5, 2, 1, 7 },
{1, 5, 2, 2, 1 }};
zip_dig
1 5 2 0 6 1 5 2 1 3 1 5 2 1 7 1 5 2 2 1
pgh[4];
int A[R][C];
A A A A A A
[0] • • • [0] [1] • • • [1] • • • [R-1] • • • [R-1]
[0] [C-1] [0] [C-1] [0] [C-1]
4*R*C bytes
• Row access
– A[i] is an array of C elements
– Each element of type T requires K bytes
– Start address of row i
A + i * (C * K)
int A[R][C];
A A A A A A
[0] ••• [0] • • • [i] ••• [i] • • • [R-1] ••• [R-1]
[0] [C-1] [0] [C-1] [0] [C-1]
A A+i*C*4 A+(R-1)*C*4
int *get_pgh_zip(int index){ #define PCOUNT 4
return pgh[index]; zip_dig pgh[PCOUNT] =
} {{1, 5, 2, 0, 6},
{1, 5, 2, 1, 3 },
{1, 5, 2, 1, 7 },
{1, 5, 2, 2, 1 }};
; eax = index
lea eax, [eax + 4 * eax] ; 5 * index
lea eax, [pgh + 4 * eax] ; pgh + (20 * index)
int A[R][C];
A A A A A
[0] ••• [0] • • • ••• [i] ••• • • • [R-1] ••• [R-1]
[0] [C-1] [j] [0] [C-1]
A A+i*C*4 A+(R-1)*C*4
A+i*C*4+j*4
int get_pgh_digit (int index, int dig) {
return pgh[index][dig];
}
– Address is calculated as
pgh + 4*((index+4*index)+dig)
zip_dig cmu = { 1, 5, 2, 1, 3 }; • The univ variable is an
zip_dig mit = { 0, 2, 1, 3, 9 }; array of 3 elements
zip_dig ucb = { 9, 4, 7, 2, 0 }; • Each element is a 4-byte
pointer
#define UCOUNT 3
int *univ[UCOUNT] = {mit, cmu, ucb}; • Each pointer references
an array of ints
cmu
1 5 2 1 3
univ
16 20 24 28 32 36
160 36 mit
0 2 1 3 9
164 16
168 56 ucb 36 40 44 48 52 56
9 4 7 2 0
56 60 64 68 72 76
int get_univ_digit (int index, int dig) {
return univ[index][dig];
}
•Similar in C
•Significant difference in assembly
Mem[pgh+20*index+4*dig] Mem[Mem[univ+4*index]+4*dig]
N x N matrix #define N 16
typedef int fix_matrix[N][N];
• Fixed dimensions /* Get element a[i][j] */
– N is known at compile time int fix_ele
(fix_matrix a, int i, int j){
return a[i][j];
}
/*
• Calculations Fetch of array column j
– Process all elements in */
column j void fix_column
• Optimization (fix_matrix a, int j, int *dest)
{
– Fetch individual
int i;
elements of the column
for (i = 0; i < N; i++)
dest[i] = a[i][j];
}
Optimizing array element access
• Optimization
– Calculate ajp = &a[i][j]
• Initial value is
a + 4*j /* Fetch of array column j */
• Step is 4*N void fix_column
(fix_matrix a, int j, int *dest)
Register Value {
int i;
ecx ajp for (i = 0; i < N; i++)
ebx dest dest[i] = a[i][j];
edx i }
.L8: ; loop:
mov eax, dword [ecx] ; get *ajp
mov dword [ebx + 4 * edx], eax ; store in dest[i]
add edx, 1 ; i++
add ecx, 64 ; ajp += 4*N
cmp edx, 16 ; i vs. N
jne .L8 ; if !=, goto loop
Optimizing array element access
– Calculate ajp = &a[i][j]
• Initial value is
a + 4*j
• Step is 4*n /* Fetch of array column j */
void var_column
Register Value
(int n, int a[n][n],
ecx ajp int j, int *dest)
edi dest {
edx i int i;
for (i = 0; i < n; i++)
ebx 4*n dest[i] = a[i][j];
esi n }
.L18: ; loop:
mov eax, dword [ecx] ; get *ajp
mov dword [edi + 4 * edx], eax ; store in dest[i]
add edx, 1 ; i++
add ecx, ebx ; ajp += 4*n
cmp esi, edx ; n vs. i
jg .L18 ; if (>) goto loop
Optimizing array element access
.L18: ; loop:
mov eax, dword [ecx] ; get *ajp
mov dword [edi + 4 * edx], eax ; store in dest[i]
add edx, 1 ; i++
add ecx, ebx ; ajp += 4*n
cmp esi, edx ; n vs. i
jg .L18 ; if (>) goto loop
Optimizing array element access
• 5 word problems
• 2 reverse engineering problems
• Solve any 5 problems for grade «excellent», but at least one
reverse engineering problem.
• Submit via e-judge:
- http://earth.ispras.ru/cgi-bin/new-client?contest_id=151&locale_id=0
- http://earth.ispras.ru/cgi-bin/new-client?contest_id=152&locale_id=0
• Sample word problem
– «Local extrema»
• Sample reverse engineering problem
– «R2»
Contest #2: «Local extrema»
Let us define a local minimum of an integer sequence to be an
element that is strictly less than both of its neighbors. Let us define a local
maximum of an integer sequence to be an element that is
strictly greater than both of its neighbors.
• Parameters placement
– Integer
• Actual value
– Pointer -> Integer
• Actual value
– Array -> Pointer
• Reference
– Structure/union
• Actual value
Function main
#include <stdio.h> CMAIN:
lea ecx, [esp+4]
int v; and esp, -16
void nullify(int argc, char* argv[]); push dword [ecx-4]
push ebp
int main(int argc, char* argv[]) { mov ebp, esp
nullify(argc, argv); push ecx
return 0; sub esp, 20
} mov eax, dword [ecx+4]
mov dword [esp+4], eax
void nullify(int argc, char* argv[]) { mov eax, dword [ecx]
} mov dword [esp], eax
call nullify
mov eax, 0
add esp, 20
pop ecx
pop ebp
lea esp, [ecx-4]
ret
nullify:
ret
Stack alignment
STDCALL
__attribute__((stdcall))
int sum(int x, int y) {
int t = x + y;
return t;
}
STDCALL
__attribute__((stdcall))
int sum(int x, int y) {
int t = x + y;
return t;
}
FASTCALL
__attribute__((fastcall)) int
sum(int x, int y) {
int t = x + y;
return t;
}
Omit frame pointer
#include <stdarg.h>
• 5 word problems
• 2 reverse engineering problems
• Solve any 5 problems for grade «excellent», but at least one
reverse engineering problem.
• Submit via e-judge
- http://earth.ispras.ru/cgi-bin/new-client?contest_id=153&locale_id=0
- http://earth.ispras.ru/cgi-bin/new-client?contest_id=154&locale_id=0
• Sample word problem
– «GCD of Four»
• Sample reverse engineering problem
– «R3»
Contest #3: «GCD of Four»
• 10 problems
• Grading policy
– Max 6 points for each problem: 60 points total
• Grade «excellent» >= 48 points (0.8)
• Grade «good» >= 36 points (0.6)
• Grade «poor» >= 24 points (0.4)
Sample problem #1
Fill in register AL value in hex and in decimal (signed and unsigned), and
values of flags CF, OF, ZF and SF after execution of the following
instructions.
Let register EAX contain a positive integer x <= 2^24. Write out
two variants, both consisting of a single assembly
instruction, that multiply x by 5. The result is to remain in
EAX. Two variants are considered distinct if mnemonics of
the used instructions are different.
Answer 1:
Answer 2:
Sample problem #4
int x, y;
x /= -y;
Sample problem #6
int A[N][N],