C Programming
C Programming
CSEE W4840
Prof. Stephen A. Edwards
Columbia University
Spring 2013
Goals
Function is correct
Source code is concise, readable, maintainable
Time-critical sections of program run fast enough
Object code is small and efficient
Optimize the use of three resources:
Execution time
Memory
Development/maintenance time
You can say the same thing many different ways and
mean the same thing.
There are many different ways to say the same thing.
The same thing may be said different ways.
There is more than one way to say it.
Many sentences are equivalent.
Be succinct.
Arithmetic
Integer Arithmetic
Fastest
Slower
Very slow
slower
+,
Simple benchmarks
for (i = 0 ; i < 10000 ; ++i)
/* arithmetic operation */
On a Pentium 4 with good hardware floating-point,
Operator      Time      Operator      Time
+ (int)       1         + (double)    5
* (int)       5         * (double)    5
/ (int)       12        / (double)    10
<< (int)      2         sqrt          28
                        sin           48
                        pow           275
Simple benchmarks
Time
Operator
Time
1
1
7
1
+ (double)
* (double)
/ (double)
sqrt
sin
pow
140
110
220
500
3300
820
C Arithmetic Trivia
Arithmetic Lessons
Bit Manipulation
&    Bit-wise AND
|    Bit-wise OR
^    Bit-wise XOR
~    Negate (ones complement)
>>   Right-shift
<<   Left-shift
Bit-manipulation basics
a |= 0x4;
/* Set bit 2 */
b &= ~0x4;
/* Clear bit 2 */
/* Toggle bit 5 */
e >>= 2;
/* Divide e by 4 */
of 1s in c */
0xaa) >> 1);
0xcc) >> 2);
0xf0) >> 4);
Faking Multiplication
Addition, subtraction, and shifting are fast. Can
sometimes supplant multiplication.
Like floating-point, not all processors have a dedicated
hardware multiplier.
Recall the multiplication algorithm from elementary
school, but think binary:
      101011        (43)
  x     1101        (13)
  ----------
      101011        43
    10101100        43 << 2
  +101011000        43 << 3
  ----------
  1000101111        = 43 + (43 << 2) + (43 << 3) = 559
Faking Multiplication
Even more clever if you include subtraction:
      101011        (43)
  x     1110        (14)
  ----------
     1010110        43 << 1
    10101100        43 << 2
  +101011000        43 << 3
  ----------
  1001011010        = 602

43 * 14 = (43 << 1) + (43 << 2) + (43 << 3)
        = (43 << 4) - (43 << 1)
        = 602
Only useful for multiplication by a constant.
Faking Division
Division is a much more complicated algorithm that
generally involves decisions.
However, division by a power of two is just a shift (for unsigned or non-negative a; for negative signed a, >> and / can round differently):
a / 2 = a >> 1
a / 4 = a >> 2
a / 8 = a >> 3
There is no general shift-and-add replacement for
division, but sometimes you can turn it into
multiplication:
a / 1.33333333 = a * 0.75
               = a * 0.5 + a * 0.25
               = (a >> 1) + (a >> 2)
Multi-way branches
/* Tests a against 1..6 in order; the first match calls its handler. */
if (a == 1)
  foo();
else if (a == 2)
  bar();
else if (a == 3)
  baz();
else if (a == 4)
  qux();
else if (a == 5)
  quux();
else if (a == 6)
  corge();
/* Dispatches on a: each case calls one handler and then breaks, so at most
   one handler runs. There is no default, so any other value of a calls nothing. */
switch (a) {
case 1:
foo(); break;
case 2:
bar(); break;
case 3:
baz(); break;
case 4:
qux(); break;
case 5:
quux(); break;
case 6:
corge(); break;
}
ldw
cmpnei
bne
call
br
.L2:
ldw
cmpnei
bne
call
br
.L4:
r2, 0(fp)
r2, r2, 1
r2, zero, .L2
foo
.L3
#
#
#
#
#
r2, 0(fp)
r2, r2, 2
r2, zero, .L4
bar
.L3
#
#
#
#
#
.L9:
.L3:
.L4:
.L5:
.L6:
.L7:
.L8:
.L2:
ldw
r2, 0(fp)
# Fetch a
cmpgeui r2, r2, 7
# Compare with 7
bne
r2, zero, .L2
# Branch if greater or equal
ldw
r2, 0(fp)
# Fetch a
muli
r3, r2, 4
# Multiply by 4
movhi
r2, %hiadj(.L9) # Load address .L9
addi
r2, r2, %lo(.L9)
add
r2, r3, r2
# = a * 4 + .L9
ldw
r2, 0(r2)
# Fetch from jump table
jmp
r2
# Jump to label
.section .rodata
.align 2
# Jump table
.long
.L2, .L3, .L4, .L5, .L6, .L7, .L8
.section .text
call
foo
br
.L2
call
bar
br
.L2
call
baz
br
.L2
call
qux
br
.L2
call
quux
br
.L2
call
corge
Function calls
RISC processors strive to make calling cheap by passing
arguments in registers. Calling, entering, and returning:
/* Calls bar() with the two arguments swapped, keeps the result in the
   local c, and returns it. */
int foo(int a,
int b) {
int c =
bar(b, a);
return c;
}
foo:
addi
stw
stw
mov
stw
stw
sp,
ra,
fp,
fp,
r4,
r5,
sp, -20
16(sp)
12(sp)
sp
0(fp)
4(fp)
#
#
#
#
#
#
ldw
ldw
call
stw
r4, 4(fp)
r5, 0(fp)
bar
r2, 8(fp)
#
#
#
#
Fetch b
Fetch a
Call bar()
Store result in c
ldw
ldw
ldw
addi
ret
r2,
ra,
fp,
sp,
#
#
#
#
#
Return value in r2 = c
Restore return address
Restore frame pointer
Release stack space
Return from subroutine
8(fp)
16(sp)
12(sp)
sp, 20
Function calls
RISC processors strive to make calling cheap by passing
arguments in registers. Calling, entering, and returning:
/* Calls bar() with the two arguments swapped, keeps the result in the
   local c, and returns it. */
int foo(int a,
int b) {
int c =
bar(b, a);
return c;
}
foo:
addi
stw
mov
mov
mov
call
ldw
addi
ret
sp,
ra,
r2,
r4,
r5,
bar
ra,
sp,
sp, -4
0(sp)
r4
r5
r2
0(sp)
sp, 4
(Optimized)
Strength Reduction
/* Anonymous struct type with fields a, b and c. Declares two pointers to
   it (fp, fe) and a 10-element array of it (foo). */
struct {
int a;
char b;
int c;
} *fp, *fe, foo[10];
i<10 ; ++i) {
= 77;
= 88;
= 99;
/* fe points one past the last element of foo and serves as the end marker. */
fe = foo + 10;
/* Step fp over every element of foo, writing all three fields through the
   pointer; the loop stops when fp reaches fe. */
for (fp = foo ; fp != fe ; ++fp) {
fp->a = 77;
fp->b = 88;
fp->c = 99;
}
r2,
r2,
r2,
r3,
r3,
r2,
r2,
r3,
r2,
r2,
r3,
r3,
r2,
r2,
r2,
r3,
r2,
r2,
0(fp)
r2, 10
zero, .L1
%hiadj(foo)
r3, %lo(foo)
0(fp)
r2, 12
r2, r3
77
0(r3)
%hiadj(foo)
r3, %lo(foo)
0(fp)
r2, 12
r2, r3
r2, 4
88
0(r3)
#
#
#
#
Fetch i
i >= 10?
exit if true
Get address of foo array
# Fetch i
# i * 12
# foo[i]
# foo[i].a = 77
# compute &foo[i]
# offset for b field
# foo[i].b = 88
r3,
r2,
r3,
r3,
r2,
r2,
r3,
r2,
r2,
r3,
r2,
r2,
r2,
r2,
r2,
.L2
0(fp)
4(fp)
r2, .L1
0(fp)
77
0(r3)
0(fp)
88
4(r3)
0(fp)
99
8(r3)
0(fp)
r2, 12
0(fp)
# fp
# fe
# fp == fe?
# fp->a = 77
# fp->b = 88
# fp->c = 99
# ++fp
movi
movi
movi
movhi
addi
movi
.L5:
addi
stw
stb
stw
addi
bne
ret
r6,
r5,
r4,
r2,
r2,
r3,
77
# Load constants
88
99
%hiadj(foo) # Load address of array
r2, %lo(foo)
10
# iteration count
r3,
r6,
r5,
r4,
r2,
r3,
r3, -1
0(r2)
4(r2)
8(r2)
r2, 12
zero, .L5
#
#
#
#
#
#
decrement iterations
foo[i].a = 77
foo[i].b = 88
foo[i].c = 99
go to next array element
if there are more to do
movhi
addi
addi
movi
movi
movi
.L5:
stw
stb
stw
addi
bne
ret
r6,
r6,
r2,
r5,
r4,
r3,
%hiadj(foo+120)
r6, %lo(foo+120)
r6, -120
77
88
99
# fe = foo + 10
r5,
r4,
r3,
r2,
r2,
0(r2)
4(r2)
8(r2)
r2, 12
r6, .L5
#
#
#
#
#
# fp = foo
# Constants
fp->a
fp->b
fp->c
++fp
fp ==
= 77
= 88
= 99
fe?
6 1024
1
50MHz
= 0.12s or 12 1024
1
50MHz
= 0.24s
Double-checking
r2,
r3,
r4,
r5,
r3,
r2,
0(r4)
r3, -1
r4, 4
r5, r2
zero, .L5
r5
#
#
#
#
#
#
#
Fetch b[i]
--i
next b element
a += b[i]
repeat if i > 0
result
cycles
2-7
1
1
1
3
Storage Classes in C
/* fixed address: visible to other files */
int global_static;

/* fixed address: only visible within file */
static int file_static;

/*
 * Demonstrates where C places objects of each storage class.
 * auto_param is accepted but not used; the function always returns 0.
 *
 * parameters: passed in registers or stacked, depending on the calling
 * convention
 */
int foo(int auto_param)
{
  /* fixed address: only visible to function */
  static int func_static;

  /* stacked: only visible to function; auto_i is initialized so the
     return below never reads an indeterminate value */
  int auto_i = 0, auto_a[10];

  /* array explicitly allocated on heap */
  double *auto_d = malloc(sizeof(double)*5);

  /* heap storage is not released automatically when the function returns;
     free it before the only pointer to it goes out of scope */
  free(auto_d);

  /* return value in register or stacked */
  return auto_i;
}
free(
free(
free(
malloc(
free(
malloc(
S N
S N
S N
S
malloc(
S N
)
S N
S
malloc(
S N
S N
)
S N
S N
S
malloc(
S N
S N
)
free( )
S N
S N
S
malloc(
S N
)
S
free( )
S
S N
S N
S N
Danger of fragmentation
Memory-Mapped I/O
/* Pointer to the byte at address 0x1800; the pointee is not volatile-qualified. */
#define ADDRESS \
((char *) 0x1800)
/* Same address, accessed through a volatile-qualified pointer. */
#define VADDRESS \
((volatile char *) 0x1800)
/* Reads the byte at ADDRESS into a and again into b, then returns a + b.
   ADDRESS is not volatile-qualified. */
char foo() {
  char a = *ADDRESS;
  char b = *ADDRESS;
  return a + b;
}
/* Reads the byte at VADDRESS into a and again into b, then returns a + b.
   Both reads go through a volatile-qualified pointer. */
char bar() {
  char a = *VADDRESS;
  char b = *VADDRESS;
  return a + b;
}
Compiled with
optimization:
foo:
movi
ldbu
add
andi
ret
r2,
r2,
r2,
r2,
6144
0(r2)
r2, r2
r2, 0xff
bar:
movi
ldbu
ldbu
add
andi
ret
r3,
r2,
r3,
r2,
r2,
6144
0(r3)
0(r3)
r2, r3
r2, 0xff
Altera I/O
/* Definitions of alt_u8, etc. */
#include "alt_types.h"
/* IORD_ALTERA_AVALON... for the PIO device */
#include "altera_avalon_pio_regs.h"
/* Auto-generated addresses for all peripherals */
#include "system.h"
/* Forever copies the value read from the switch PIO to the LED PIO. */
int main() {
  alt_u8 switch_state;

  while (1) {
    /* Sample the data register at SWITCHES_BASE (kept as 8 bits) ... */
    switch_state = IORD_ALTERA_AVALON_PIO_DATA(SWITCHES_BASE);
    /* ... and write the sampled value to the data register at LEDS_BASE. */
    IOWR_ALTERA_AVALON_PIO_DATA(LEDS_BASE, switch_state);
  }
}
Unix Signals
The Unix environment provides signals, which behave
like interrupts.
#include <stdio.h>
#include <signal.h>
/*
 * SIGINT handler: prints a message and re-registers itself.
 *
 * signal() expects a handler of type void (*)(int) and calls it with the
 * signal number, so the parameter is declared explicitly even though it is
 * not used.
 */
void handleint(int signum) {
  (void) signum;  /* not needed by this handler */

  /* NOTE(review): printf is not async-signal-safe; it is kept so the demo's
     output is unchanged. Production code should use write() or set a flag. */
  printf("Got an INT\n");
  /* some variants require this */
  signal(SIGINT, handleint);
}
/* Installs handleint for SIGINT, then spins forever. */
int main() {
  /* Install handleint as the SIGINT handler */
  signal(SIGINT, handleint);

  /* Busy-wait indefinitely; the return below is never reached */
  while (1) { }

  return 0;
}
#include "system.h"
#include "altera_avalon_pio_regs.h"
#include "alt_types.h"
/*
 * Handler for the button PIO. context is used as the address of an int
 * that receives the edge-capture value; id is not used.
 */
static void button_isr(void* context, alt_u32 id)
{
  volatile int *edge_capture = (volatile int *) context;

  /* Read the edge capture register and store it for the caller */
  *edge_capture = IORD_ALTERA_AVALON_PIO_EDGE_CAP(BUTTON_PIO_BASE);

  /* Write 0 to the edge capture register to reset it */
  IOWR_ALTERA_AVALON_PIO_EDGE_CAP(BUTTON_PIO_BASE, 0);

  /* Write 0xf to the IRQ mask register to reset interrupt capability
     for the Button PIO */
  IOWR_ALTERA_AVALON_PIO_IRQ_MASK(BUTTON_PIO_BASE, 0xf);
}
Debugging Skills