Packet Sniffer Code in C Using Linux Sockets
Packet Sniffer Code in C Using Linux Sockets
In the previous part we made a simple sniffer which created a raw socket and started receiving on it. But it had a few drawbacks:
1. Could sniff only incoming data.
2. Could sniff only TCP or UDP or ICMP or any one protocol packets at a time.
3. Provided IP frames, so Ethernet headers were not available.
In this article we are going to modify the same code to fix the above 3 drawbacks. However we shall not be using libpcap. This will
be done using pure Linux sockets. The difference is very small and is 2 lines :
Instead of :
sock_raw = socket(AF_INET , SOCK_RAW , IPPROTO_TCP);
1
We do :
sock_raw = socket( AF_PACKET , SOCK_RAW , htons(ETH_P_ALL)) ;
//Optional
//setsockopt(sock_raw , SOL_SOCKET , SO_BINDTODEVICE , "eth0" , strlen("eth0")+ 1 );
1
2
3
Now it will:
1. Sniff both incoming and outgoing traffic.
2. Sniff ALL ETHERNET FRAMES, which includes all kinds of IP packets and even more if there are any.
3. Provides the Ethernet headers too, which contain the mac addresses.
The setsockopt line is optional. It's important to provide the correct interface name to setsockopt, eth0 in this case and in most cases.
So maybe you would like to present the user with a list of the available interfaces and allow them to choose the one to be sniffed.
AGAM NOTE: Can we use this method with the adapter in promiscuous mode and capture everything on the wire? Libpcap
will do this but can this method do it?
//Provides declarations
//Provides declarations
//Provides declarations
//Provides declarations
//For ETH_P_ALL
//For ether_header
ProcessPacket(unsigned char* ,
print_ip_header(unsigned char*
print_tcp_packet(unsigned char
print_udp_packet(unsigned char
int);
, int);
* , int );
* , int );
for
for
for
for
icmp header
udp header
tcp header
ip header
if(sock_raw < 0) {
//Print the error with proper message
perror("Socket Error");
return 1;
}
while(1) {
saddr_size = sizeof saddr;
//Receive a packet
data_size = recvfrom(sock_raw , buffer , 65536 , 0 , &saddr ,
(socklen_t*) &saddr_size);
if(data_size <0 ) {
printf("Recvfrom error , failed to get packets\n");
return 1;
}
//Now process the packet
ProcessPacket(buffer , data_size);
}
close(sock_raw);
printf("Finished");
return 0;
//TCP Protocol
++tcp;
print_tcp_packet(buffer , size);
break;
case 17: //UDP Protocol
++udp;
print_udp_packet(buffer , size);
break;
default: //Some Other Protocol like ARP etc.
++others;
break;
}
printf("TCP : %d
UDP : %d
ICMP : %d
IGMP : %d
, udp ,
icmp , igmp , others , total);
}
Others : %d
fprintf(logfile , "\n");
fprintf(logfile , "Ethernet Header\n");
fprintf(logfile , "
|-Destination Address : %.2X-%.2X-%.2X-%.2X-%.2X-%.2X \n",
eth->h_dest[0],eth->h_dest[1],eth->h_dest[2],eth->h_dest[3],eth->h_dest[4],
eth->h_dest[5] );
fprintf(logfile , "
|-Source Address
: %.2X-%.2X-%.2X-%.2X-%.2X-%.2X \n",
eth->h_source[0] ,
eth->h_source[1] , eth->h_source[2] , eth->h_source[3] , eth->h_source[4] ,
eth->h_source[5] );
fprintf(logfile , "
|-Protocol
: %u \n",(unsigned short)eth->h_proto);
+ sizeof(struct ethhdr) );
memset(&source, 0, sizeof(source));
source.sin_addr.s_addr = iph->saddr;
memset(&dest, 0, sizeof(dest));
dest.sin_addr.s_addr = iph->daddr;
fprintf(logfile , "\n");
fprintf(logfile , "IP Header\n");
fprintf(logfile , "
|-IP Version
: %d\n",(unsigned int)iph->version);
fprintf(logfile , "
|-IP Header Length : %d DWORDS or %d Bytes\n",
(unsigned int)iph->ihl, ((unsigned int)(iph->ihl))*4);
fprintf(logfile , "
|-Type Of Service
: %d\n",(unsigned int)iph->tos);
fprintf(logfile , "
|-IP Total Length
: %d Bytes(Size of Packet)\n",
ntohs(iph->tot_len));
fprintf(logfile , "
|-Identification
: %d\n",ntohs(iph->id));
//fprintf(logfile , " |-Reserved ZERO Field
: %d\n",
(unsigned int)iphdr>ip_reserved_zero);
//fprintf(logfile , " |-Dont Fragment Field
: %d\n",
(unsigned int)iphdr->ip_dont_fragment);
+ sizeof(struct ethhdr) );
fprintf(logfile , "\n###########################################################");
sizeof(struct ethhdr));
+ sizeof(struct ethhdr));
,
,
,
,
,
"\nUDP Header\n");
"
|-Source Port
"
|-Destination Port
"
|-UDP Length
"
|-UDP Checksum
:
:
:
:
%d\n"
%d\n"
%d\n"
%d\n"
,
,
,
,
ntohs(udph->source));
ntohs(udph->dest));
ntohs(udph->len));
ntohs(udph->check));
fprintf(logfile , "\n");
fprintf(logfile , "IP Header\n");
PrintData(Buffer , iphdrlen);
fprintf(logfile , "UDP Header\n");
PrintData(Buffer+iphdrlen , sizeof udph);
fprintf(logfile , "Data Payload\n");
//Move the pointer ahead and reduce the size of string
PrintData(Buffer + header_size , Size - header_size);
}
fprintf(logfile , "\n###########################################################");
+ sizeof(struct ethhdr));
fprintf(logfile ,
fprintf(logfile ,
//fprintf(logfile
//fprintf(logfile
fprintf(logfile ,
"
|-Code : %d\n",(unsigned int)(icmph->code));
"
|-Checksum : %d\n",ntohs(icmph->checksum));
, "
|-ID
: %d\n",ntohs(icmph->id));
, "
|-Sequence : %d\n",ntohs(icmph->sequence));
"\n");
");
***********************TCP Packet*************************
Ethernet Header
|-Source Address
: 00-1C-C0-F8-79-EE
|-Protocol
: 8
IP Header
|-IP Version
: 4
: 5 DWORDS or 20 Bytes
|-Type Of Service
: 0
: 141
|-Identification
: 13122
|-TTL
Bytes(Size of Packet)
: 64
|-Protocol : 6
|-Checksum : 45952
|-Source IP
: 192.168.1.6
|-Destination IP
: 74.125.71.125
TCP Header
|-Source Port
: 33655
|-Sequence Number
: 78458457
|-Header Length
: 5 DWORDS or 20 BYTES
|-Urgent Flag
: 0
|-Acknowledgement Flag : 1
|-Push Flag
: 1
|-Reset Flag
: 0
|-Synchronise Flag
: 0
|-Finish Flag
: 0
|-Window
: 62920
|-Checksum
: 21544
|-Urgent Pointer : 0
DATA Dump
IP Header
00 25 5E 1A 3D F1 00 1C C0 F8 79 EE 08 00 45 00
.%^.=.....y...E.
00 8D 33 42
..3B
TCP Header
40 00 40 06 B3 80 C0 A8 01 06 4A 7D 47 7D 83 77
@.@..?....J}G}.w
14 66 04 AD
.f..
Data Payload
17 03 01 00 60 A0 9C 5D 14 A1 25 AB CE 8B 7C EB
....`..]..%...|.
1A A4 43 A6 60 DD E8 6B 6E 43 C1 94 6A D2 25 23
..C.`..knC..j.%#
03 98 59 67 1A 2C 07 D3 7E B2 B8 9F 83 38 4C 69
..Yg.,..~....8Li
D3 3A 8E 0D 9E F0 6B CE 9E 6B F4 E1 BD 9E 50 53
.:....k..k....PS
6D F6 AB 11 05 D6 41 82 F0 03 0C A6 E2 48 2B 71
m.....A......H+q
16 81 FF 5B DF 50 D4 5B AD 90 04 5E 4C 94 E7 9B
...[.P.[...^L...
0B 72 7E 32 88
.r~2.
###########################################################
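In the above log we can see the Ethernet headers being printed. They show the source and destination MAC addresses along with the
packet protocol. 8 means IP protocol: the EtherType field holds ETH_P_IP (0x0800) in network byte order and the code prints it without calling ntohs, so a little-endian host displays it as 0x0008.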
Note:
1. If you want to sniff only IP and ARP packets for example then you can try this:
sock_raw = socket( AF_PACKET , SOCK_RAW , htons(ETH_P_IP|ETH_P_ARP)) ;
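Be aware that EtherType values are identifiers, not bit flags: ETH_P_IP|ETH_P_ARP evaluates to 0x0806, which is the same value as ETH_P_ARP, so the call above selects ARP frames only. To capture both, open the socket with ETH_P_ALL and filter on the EtherType of each received frame.
The complete list of protocols is found in /usr/include/linux/if_ether.h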
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
* These are the defined Ethernet Protocol ID's.
*/
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
ETH_P_LOOP 0x0060
ETH_P_PUP
0x0200
ETH_P_PUPAT 0x0201
ETH_P_IP
0x0800
ETH_P_X25
0x0805
ETH_P_ARP
0x0806
ETH_P_BPQ
0x08FF
ETH_P_IEEEPUP
0x0a00
ETH_P_IEEEPUPAT 0x0a01
ETH_P_DEC
0x6000
ETH_P_DNA_DL
0x6001
ETH_P_DNA_RC
0x6002
ETH_P_DNA_RT
0x6003
/*
/*
/*
/*
/*
/*
/*
/*
/*
/*
/*
/*
/*
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
Enjoy!!
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
ETH_P_LAT
0x6004 /* DEC LAT
*/
ETH_P_DIAG
0x6005 /* DEC Diagnostics
*/
ETH_P_CUST
0x6006 /* DEC Customer use
*/
ETH_P_SCA
0x6007 /* DEC Systems Comms Arch
*/
ETH_P_TEB
0x6558 /* Trans Ether Bridging
*/
ETH_P_RARP
0x8035 /* Reverse Addr Res packet */
ETH_P_ATALK 0x809B
/* Appletalk DDP
*/
ETH_P_AARP 0x80F3
/* Appletalk AARP
*/
ETH_P_8021Q 0x8100
/* 802.1Q VLAN Extended Header */
ETH_P_IPX
0x8137
/* IPX over DIX
*/
ETH_P_IPV6 0x86DD
/* IPv6 over bluebook
*/
ETH_P_PAUSE 0x8808
/* IEEE Pause frames. See 802.3 31B */
ETH_P_SLOW 0x8809
/* Slow Protocol. See 802.3ad 43B */
ETH_P_WCCP 0x883E
/* Web-cache coordination protoc draft-wilson-wrec-wccp-v2-00.txt */
ETH_P_PPP_DISC 0x8863 /* PPPoE discovery messages
*/
ETH_P_PPP_SES
0x8864 /* PPPoE session messages
*/
ETH_P_MPLS_UC
0x8847 /* MPLS Unicast traffic
*/
ETH_P_MPLS_MC
0x8848 /* MPLS Multicast traffic
*/
ETH_P_ATMMPOA
0x884c /* MultiProtocol Over ATM
*/
ETH_P_LINK_CTL 0x886c /* HPNA, wlan link local tunnel */
ETH_P_ATMFATE
0x8884 /* Frame-based ATM Transport
* over Ethernet
*/
ETH_P_PAE
0x888E
/* Port Access Entity (IEEE 802.1X) */
ETH_P_AOE
0x88A2
/* ATA over Ethernet
*/
ETH_P_TIPC 0x88CA
/* TIPC
*/
ETH_P_1588 0x88F7
/* IEEE 1588 Timesync */
ETH_P_FCOE 0x8906
/* Fibre Channel over Ethernet */
ETH_P_FIP
0x8914
/* FCoE Initialization Protocol */
ETH_P_EDSA 0xDADA
/* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
/*
* Non DIX types. Won't clash for 1500 types.
*/
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
#define
ETH_P_802_3 0x0001
ETH_P_AX25 0x0002
ETH_P_ALL
0x0003
ETH_P_802_2 0x0004
ETH_P_SNAP 0x0005
ETH_P_DDCMP
0x0006
ETH_P_WAN_PPP
0x0007
ETH_P_PPP_MP
0x0008
ETH_P_LOCALTALK 0x0009
ETH_P_CAN
0x000C
ETH_P_PPPTALK
0x0010
ETH_P_TR_802_2 0x0011
ETH_P_MOBITEX
0x0015
ETH_P_CONTROL
0x0016
ETH_P_IRDA 0x0017
ETH_P_ECONET
0x0018
ETH_P_HDLC 0x0019
ETH_P_ARCNET
0x001A
ETH_P_DSA
0x001B
ETH_P_TRAILER
0x001C
ETH_P_PHONET
0x00F5
ETH_P_IEEE802154 0x00F6
ETH_P_CAIF 0x00F7
/*
/*
/*
/*
/*
/*
/*
/*
/*
1 Objective
2 Background
3 Scenario
4 Method
4.1 Overview
5 Alternatives
6 Further reading
Tested on
Debian (Lenny)
Ubuntu (Lucid, Trusty)
Objective
To send an arbitrary Ethernet frame using an AF_PACKET socket
Background
Ethernet is a link layer protocol. Most networking programs interact with the network stack at the
transport layer or above, so have no need to deal with Ethernet frames directly, but there are some
circumstances where interaction at a lower level may be necessary. These include:
implementation of Ethernet-based protocols that are not built in to the network stack, and
production of malformed or otherwise non-standard frames for testing purposes.
Scenario
Suppose that you wish to send an ARP request for the IP address 192.168.0.83. The request is to be
sent from interface eth0 to the broadcast MAC address.
(ARP is the Address Resolution Protocol. It is used when a host needs to send a datagram to a given
IP address, but does not know which MAC address corresponds to that IP address.)
Method
Overview
The method described here has five steps:
1. Select the required EtherType.
2. Create the AF_PACKET socket.
3. Determine the index number of the Ethernet interface to be used.
4. Construct the destination address.
5. Send the Ethernet frame.
The following header files are used:
Header
Used by
<errno.h>
<string.h>
errno
<arpa/inet.h>
in_addr_t, htons
<net/ethernet.h>
<net/if.h>
<netinet/if_ether.h>
<netpacket/packet.h>
<sys/ioctl.h>
<sys/socket.h>
ETHER_ADDR_LEN, ETH_P_*
struct ifreq
struct ether_arp
struct sockaddr_ll
SIOCGIFINDEX, ioctl
struct sockaddr, struct iovec, struct
AF_PACKET sockets are specific to Linux. Programs that make use of them need elevated privileges
in order to run.
Setting SO_BROADCAST does not appear to be necessary when sending broadcast frames using
an AF_PACKET socket. Some programs do so anyway, which is unlikely to be harmful, and could be
considered a worthwhile hedge against any future change in behaviour.
0x88b5
socket
function.
int fd=socket(AF_PACKET,SOCK_DGRAM,htons(ETH_P_ARP));
if (fd==-1) {
die("%s",strerror(errno));
}
For further details of this method see the microHOWTO Get the index number of a Linux network
interface in C using SIOCGIFINDEX.
Frames can in principle be sent using any function that is capable of writing to a file descriptor,
however if you have opted for the link-layer header to be constructed automatically then it will be
necessary to use either sendto or sendmsg so that a destination address can be specified. Of
these sendmsg is the more flexible option, but at the cost of a significantly more complex interface.
Details of each function are given below.
Regardless of which function you choose, each function call will result in a separate datagram
being sent. For this reason you must either compose each datagram payload as a single, contiguous
block of memory, or make use of the scatter/gather capability provided by sendmsg.
In this particular scenario the payload to be sent is an ARP request. For completeness, here is an
example of how such a payload might be constructed:
struct ether_arp req;
req.arp_hrd=htons(ARPHRD_ETHER);
req.arp_pro=htons(ETH_P_IP);
req.arp_hln=ETHER_ADDR_LEN;
req.arp_pln=sizeof(in_addr_t);
req.arp_op=htons(ARPOP_REQUEST);
memset(&req.arp_tha,0,sizeof(req.arp_tha));
You will need to set req.arp_tpa to contain the IP address (in network byte order) for which you
want to find the corresponding MAC address. For example, starting from a string in dotted quad
format:
const char* target_ip_string="192.168.0.83";
struct in_addr target_ip_addr={0};
if (!inet_aton(target_ip_string,&target_ip_addr)) {
die("%s is not a valid IP address",target_ip_string);
}
memcpy(&req.arp_tpa,&target_ip_addr.s_addr,sizeof(req.arp_tpa));
You will also need to set source_ip_addr and source_hw_addr to contain the IP and MAC addresses
of the interface from which the request will be sent (in network byte order). See the
microHOWTOs Get the IP address of a network interface in C using SIOCGIFADDR and Get the
MAC address of an Ethernet interface in C using SIOCGIFHWADDR for details of how to obtain
these given the interface name.
The fourth argument is for specifying flags which modify the behaviour of sendto, none of which
are needed in this example.
The value returned by sendto is the number of bytes sent, or -1 if there was an
error. AF_PACKET frames are sent atomically, so unlike when writing to a TCP socket there is no need
to wrap the function call in a loop to handle partially-sent data.
The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to sendmsg it specifies where the destination address, the datagram
payload and any ancillary data are stored. In this example no ancillary data has been provided.
If you wish to pass any flags into sendmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to sendmsg (which is zero in
this example).
Alternatives
Using libpcap
See:
libpcap is a cross-platform library for capturing traffic from network interfaces. It also has the
ability to send, so provides broadly the same functionality as a packet socket (and on Linux, is
implemented using a packet socket).
The main advantage of using libpcap is that it abstracts away differences between the operating
systems that it supports, thereby allowing relatively portable code to be written. This involves some
loss of functionality, and that may make libpcap unsuitable for use in some circumstances, but
otherwise it is recommended in preference to AF_PACKET sockets on the grounds of portability.
Raw sockets differ from packet sockets in that they operate at the network layer as opposed to the
link layer. For this reason they are limited to network protocols for which raw socket support has
been explicitly built into the network stack, but they also have a number of advantages which result
from operating at a higher level of abstraction:
You can write code that will work with any suitable type of network interface.
Routing and link-layer address resolution are handled for you.
The network layer header is constructed for you unless you request otherwise.
The raw socket API has been partially standardised by POSIX, whereas AF_PACKET sockets
are specific to Linux.
For these reasons, use of a raw socket is recommended unless you specifically need the extra
functionality provided by working at the link layer.
Further reading
packet(7) (Linux manpage)
1 Objective
2 Background
3 Scenario
4 Method
4.1 Overview
5 Variations
5.1 Sending to the IPv4 broadcast address
6 Alternatives
6.1 Sending at the link layer
7 See also
8 Further reading
Tested on
Debian (Lenny)
Ubuntu (Lucid)
Objective
To send an arbitrary IPv4 datagram using a raw socket in C
Background
Most programs that communicate using the Internet Protocol do so through a transport-layer
protocol such as TCP or UDP and have no need to deal directly with Internet Protocol datagrams,
but there are some circumstances where it is necessary to interact with the network stack at a lower
level. These include:
implementation of transport-layer protocols that are not built in to the
network stack, and
production of malformed or otherwise non-standard datagrams for testing
purposes.
Scenario
Suppose that you wish to send an ICMP echo request to a given IPv4 address. (This is what
the ping command does to determine whether there is a reachable host at that address.)
There is no POSIX API call that provides this functionality per se. You therefore intend to assemble
an ICMP message with the required content then send it as the payload of an IP datagram using a
raw socket.
Method
Overview
The method described here has five steps:
1. Select the required protocol number.
2. Create the raw socket.
3. Optionally, set the IP_HDRINCL socket option.
<errno.h>
<unistd.h>
<netdb.h>
<sys/socket.h>
<netinet/in.h>
Note that POSIX-compatible operating systems are not obliged to support raw sockets at all, and
the API that has been fully standardised is quite restrictive. For this reason it is often necessary for
programs that use raw sockets to stray into the realm of implementation-defined behaviour. They
are also likely to require elevated privileges in order to run.
There are several sources from which protocol numbers can be obtained:
Some protocol numbers are defined as constants by the API. POSIX
defines IPPROTO_TCP, IPPROTO_UDP and IPPROTO_ICMP, and glibc defines many
more.
Protocol numbers can be looked up at run time by calling the
function getprotobyname.
IANA maintains a list of assigned protocol numbers.
Unlike a TCP or UDP port number there is little risk of an assigned IP protocol number ever
needing to change, especially for a widely-used protocol such as ICMP. For this reason there is no
real need to look up the protocol number at runtime, and it is quite reasonable for the required value
to be hard-coded.
For this particular example there is a symbolic constant, IPPROTO_ICMP, that all POSIX-compatible
operating systems are supposed to provide. The simplest solution would be to use that. If you
instead want to call getprotobyname then this can be done as follows:
const char* protocol_name="icmp";
struct protoent* protocol=getprotobyname(protocol_name);
if (!protocol) {
die("Protocol %s not found",protocol_name);
}
int protocol_number=protocol->p_proto;
socket
function.
protocol
An alternative to specifying the protocol number as the third argument is to use the
value IPPROTO_RAW. POSIX does not generally allow this, but some implementations use it as a
wildcard or a dummy value. (In the case of Linux it allows any protocol to be sent (with headers)
but nothing can be received.)
In this instance the socket will be used for sending ICMP messages, therefore the third argument
should be set to IPPROTO_ICMP:
int fd=socket(AF_INET,SOCK_RAW,IPPROTO_ICMP);
if (fd==-1) {
die("%s",strerror(errno));
}
Support for IP_HDRINCL is quite common, but the details vary as to:
the byte order that should be used for each of the header fields (which is
not necessarily the same for all fields), and
which fields (if any) are filled in automatically.
Some operating systems set IP_HDRINCL implicitly when IPPROTO_RAW is selected (on the grounds
that it would make little sense not to supply a header in that case) but others require an explicit call
to setsockopt. If you want to enable header inclusion then it is prudent to set it regardless, in order
to accommodate either behaviour.
This makes use of the icmphdr structure provided by glibc and the ip_checksum function described
in the microHOWTO Calculate an Internet Protocol checksum in C. Note that sizeof(req) cannot
be used to obtain the size of the payload because struct icmphdr is not specific to echo requests, so
the constant req_size has been defined for this purpose.
The fourth argument is for specifying flags which modify the behaviour of sendto, none of which
are needed in this example.
The value returned by sendto is the number of bytes sent, or -1 if there was an error. Raw
datagrams are sent atomically, so unlike when writing to a TCP socket there is no need to wrap the
function call in a loop to handle partially-sent data.
The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to sendmsg it specifies where the destination address, the datagram
payload and any ancillary data are stored. In this example no ancillary data has been provided.
If you wish to pass any flags into sendmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to sendmsg (which is zero in
this example).
Variations
Sending to the IPv4 broadcast address
By default, attempts to send a datagram to the broadcast address are rejected with an error
(typically EACCES, however it is not obvious from the POSIX specification which error should
occur). This is a safety measure intended to reduce the risk of making unintended broadcasts. It can
be overridden by setting the SO_BROADCAST socket option:
int broadcast=1;
if (setsockopt(fd,SOL_SOCKET,SO_BROADCAST,
&broadcast,sizeof(broadcast))==-1) {
die("%s",strerror(errno));
}
Alternatives
Sending at the link layer
See:
Raw sockets of the type described above operate at the network layer. An alternative would be to
inject packets at the link layer, for example in the form of Ethernet frames. This can be done using
libpcap or (on Linux-based systems) using an AF_PACKET socket.
This approach makes it possible to implement any network-layer protocol, whether or not it is
explicitly supported by the network stack, but also brings a number of disadvantages which result
from operating at a lower level of abstraction:
The sender must construct the network layer header, and depending on
the method of injection, perhaps also the link layer header.
The sender must take responsibility for routing and link-layer address
resolution (although it may be possible to delegate these tasks back to
the operating system rather than implementing them from scratch).
The above cannot normally be done without knowledge of the link layer
protocol, which will typically need to be coded into the sending program
on a case-by-case basis.
For these reasons, use of a raw socket is recommended unless you specifically need the extra
functionality provided by working at the link layer.
See also
Send a UDP datagram in C
Establish a TCP connection in C
Further reading
raw(7) (Linux manpage)
The Open Group, sendto, Base Specifications Issue 6
The Open Group, sendmsg, Base Specifications Issue 6
ithilgore, SOCK_RAW Demystified, May 2008
1 Objective
2 Background
3 Scenario
4 Method
4.1 Overview
5 Example program
6 Alternatives
7 Further reading
Tested on
Debian (Lenny, Squeeze)
Ubuntu (Lucid)
Objective
To send an arbitrary Ethernet frame using libpcap
Background
Ethernet is a link layer protocol. Most networking programs interact with the network stack at the
transport layer or above, so have no need to deal with Ethernet frames directly, but there are some
circumstances where interaction at a lower level may be necessary. These include:
implementation of Ethernet-based protocols that are not built in to the
network stack, and
production of malformed or otherwise non-standard frames for testing
purposes.
Scenario
Suppose that you wish to send an ARP request for a given IP address from a given Ethernet
interface. You wish to use libpcap to perform the sending.
(ARP is the Address Resolution Protocol. It is used when a host needs to send a datagram to a given
IP address, but does not know which MAC address corresponds to that IP address. It is described
in RFC 826.)
Method
Overview
The method described here has five steps:
1. Select the required EtherType.
2. Construct the Ethernet frame.
3. Obtain a PCAP descriptor by calling pcap_open_live.
4. Send the Ethernet frame by calling pcap_inject.
5. Close the PCAP descriptor by calling pcap_close.
Used by
<stdio.h>
fprintf
<stdlib.h>
exit
<pcap.h>
Be aware that:
Not all network devices are Ethernet interfaces, or use an Ethernet-compatible frame format, or support packet injection using libpcap.
Although a link-layer header must be supplied, libpcap does not promise
to use it as-is: both the source address and the EtherType are at risk of
being altered.
Programs that send raw packets, using this or any other method, are likely to require elevated
privileges in order to run.
0x88b5
The first argument to pcap_open_live is the name of the interface from which the Ethernet frame is
to be sent, for example eth0. (Remember that not all interfaces are suitable for sending Ethernet
frames.)
The second, third and fourth arguments are the snapshot length, promiscuous mode flag and
timeout. These control how packets are captured, and for the task in hand it is unimportant what
values are used, but if you want to capture as well as send then you will need to ensure that they
have been set appropriately (especially the snapshot length).
The last argument points to a buffer for returning error messages, which must be at
least PCAP_ERRBUF_SIZE bytes long. As suggested on the pcap_open_live manpage, this has been set
to the empty string before the function call then inspected afterwards in order to detect both
warnings and errors.
The value returned by pcap_inject is the number of bytes sent, or -1 if there was an error. In the
latter case a human-readable error message can be obtained using pcap_geterr or (as in this
example) printed using pcap_perror.
Example program
The following example program constructs and sends an ARP request using the method described
above:
send_arp.c
When invoked it takes two arguments, the name of the Ethernet interface and the (numeric) IP
address to which the ARP request should be directed:
./send_arp eth0 192.168.0.83
Alternatives
Using an AF_PACKET socket
See:
Raw sockets differ from packet sockets in that they operate at the network layer as opposed to the
link layer. For this reason they are limited to network protocols for which raw socket support has
been explicitly built into the network stack, but they also have a number of advantages which result
from operating at a higher level of abstraction:
You can write code that will work with any suitable type of network
interface.
Routing and link-layer address resolution are handled for you.
The network layer header is constructed for you unless you request
otherwise.
The raw socket API has been partially standardised by POSIX.
For these reasons, use of a raw socket is recommended unless you specifically need the extra
functionality provided by working at the link layer.
Further reading
PCAP(3) (libpcap manpage)
memset(header.ether_dhost,0xff,sizeof(header.ether_dhost));
// Construct ARP request (except for MAC and IP addresses).
struct ether_arp req;
req.arp_hrd=htons(ARPHRD_ETHER);
req.arp_pro=htons(ETH_P_IP);
req.arp_hln=ETHER_ADDR_LEN;
req.arp_pln=sizeof(in_addr_t);
req.arp_op=htons(ARPOP_REQUEST);
memset(&req.arp_tha,0,sizeof(req.arp_tha));
// Convert target IP address from string, copy into ARP request.
struct in_addr target_ip_addr={0};
if (!inet_aton(target_ip_string,&target_ip_addr)) {
fprintf(stderr,"%s is not a valid IP address",target_ip_string);
exit(1);
}
memcpy(&req.arp_tpa,&target_ip_addr.s_addr,sizeof(req.arp_tpa));
// Write the interface name to an ifreq structure,
// for obtaining the source MAC and IP addresses.
struct ifreq ifr;
size_t if_name_len=strlen(if_name);
if (if_name_len<sizeof(ifr.ifr_name)) {
memcpy(ifr.ifr_name,if_name,if_name_len);
ifr.ifr_name[if_name_len]=0;
} else {
fprintf(stderr,"interface name is too long");
exit(1);
}
// Open an IPv4-family socket for use when calling ioctl.
int fd=socket(AF_INET,SOCK_DGRAM,0);
if (fd==-1) {
perror(0);
exit(1);
}
// Obtain the source IP address, copy into ARP request
if (ioctl(fd,SIOCGIFADDR,&ifr)==-1) {
perror(0);
close(fd);
exit(1);
}
struct sockaddr_in* source_ip_addr = (struct sockaddr_in*)&ifr.ifr_addr;
memcpy(&req.arp_spa,&source_ip_addr->sin_addr.s_addr,sizeof(req.arp_spa));
// Obtain the source MAC address, copy into Ethernet header and ARP request.
if (ioctl(fd,SIOCGIFHWADDR,&ifr)==-1) {
perror(0);
close(fd);
exit(1);
}
if (ifr.ifr_hwaddr.sa_family!=ARPHRD_ETHER) {
fprintf(stderr,"not an Ethernet interface");
close(fd);
exit(1);
}
const unsigned char* source_mac_addr=(unsigned char*)ifr.ifr_hwaddr.sa_data;
memcpy(header.ether_shost,source_mac_addr,sizeof(header.ether_shost));
memcpy(&req.arp_sha,source_mac_addr,sizeof(req.arp_sha));
close(fd);
// Combine the Ethernet header and ARP request into a contiguous block.
unsigned char frame[sizeof(struct ether_header)+sizeof(struct ether_arp)];
memcpy(frame,&header,sizeof(struct ether_header));
memcpy(frame+sizeof(struct ether_header),&req,sizeof(struct ether_arp));
// Open a PCAP packet capture descriptor for the specified interface.
char pcap_errbuf[PCAP_ERRBUF_SIZE];
pcap_errbuf[0]='\0';
pcap_t* pcap=pcap_open_live(if_name,96,0,0,pcap_errbuf);
if (pcap_errbuf[0]!='\0') {
fprintf(stderr,"%s\n",pcap_errbuf);
}
if (!pcap) {
exit(1);
}
// Write the Ethernet frame to the interface.
if (pcap_inject(pcap,frame,sizeof(frame))==-1) {
pcap_perror(pcap,0);
pcap_close(pcap);
exit(1);
}
// Close the PCAP descriptor.
pcap_close(pcap);
return 0;
}
1 Objective
2 Scenario
3 Method
3.1 Overview
3.2 Create an ifreq structure for passing data in and out of ioctl
4 See also
5 Further reading
Tested on
Debian (Lenny, Squeeze)
Ubuntu (Lucid, Precise)
Objective
To get the MAC address of an Ethernet interface in C using the ioctl command SIOCGIFHWADDR
Scenario
Suppose you wish to display the MAC address of an Ethernet interface. The variable if_name points
to a null-terminated string containing the name of the interface (for example, eth0).
Method
Overview
On Linux-based systems the MAC address of an interface can be obtained using
the ioctl command SIOCGIFHWADDR. The method described here has five steps:
1. Create an ifreq structure for passing data in and out of ioctl.
<errno.h>
<string.h>
<stdio.h>
<sys/ioctl.h>
<net/if.h>
<net/if_arp.h>
ifreq
structure.
ioctl.
if_name
points
Invoke ioctl
Once you have the ifreq structure and socket descriptor then you are ready to invoke ioctl:
// Ask the kernel for the hardware address; the result is written into ifr.
if (ioctl(fd,SIOCGIFHWADDR,&ifr)==-1) {
    // Capture errno before close(), which could otherwise overwrite it.
    const int saved_errno=errno;
    close(fd);
    die("%s",strerror(saved_errno));
}
// The descriptor was only needed for the ioctl call itself.
close(fd);
If this completes without error then the hardware address of the interface should have been returned
in ifr.ifr_hwaddr in the form of a struct sockaddr.
Other possible values of sa_family for different types of network interface can be found in the
header file <net/if_arp.h>, each beginning with the prefix ARPHRD_. Note that for some of these
(such as ARPHRD_LOOPBACK) there is no hardware address as such.
See also
Get the IP address of a network interface in C using SIOCGIFADDR
Further reading
netdevice(7) (Linux manpage)
Content
1 Objective
2 Scenario
3 Method
3.1 Overview
3.2 Create an ifreq structure for passing data in and out of ioctl
3.3 Provide an open socket descriptor with the address family AF_INET
4 See also
5 Further reading
Tested on
Debian (Lenny)
Ubuntu (Precise, Trusty)
Objective
To get the IPv4 address of a network interface in C using the ioctl command SIOCGIFADDR
Scenario
Suppose that you wish to display the IPv4 address of a network interface. The
variable if_name points to a null-terminated string containing the name of the interface (for
example, eth0).
Method
Overview
On Linux-based systems, one way to obtain the IPv4 address of an interface is to use
the ioctl command SIOCGIFADDR. The method described here has four steps:
1. Create an
ifreq
ioctl.
AF_INET.
3. Invoke
ioctl.
ifreq
structure.
The following header files are needed when using this method:
#include <sys/ioctl.h>
#include <net/if.h>
#include <netinet/in.h>
<errno.h>
<string.h>
<stdio.h>
<arpa/inet.h>
Please note that whilst this method can be used with some network protocols other than IPv4, the
Linux implementation does not support IPv6. Furthermore it is only able to return a single result for
any given network protocol, so will only return one of the addresses of an interface that has several.
It is not necessarily portable to other POSIX-compatible systems, and is no longer the preferred
method on Linux.
Invoke ioctl
Once you have the ifreq structure and socket descriptor then you are ready to invoke ioctl:
// Ask the kernel for the interface address; the result is written into ifr.
if (ioctl(fd,SIOCGIFADDR,&ifr)==-1) {
    // Capture errno before close(), which could otherwise overwrite it.
    const int saved_errno=errno;
    close(fd);
    die("%s",strerror(saved_errno));
}
// The descriptor was only needed for the ioctl call itself.
close(fd);
If this completes without error then the IPv4 address of the interface should have been returned
in ifr.ifr_addr in the form of a struct sockaddr_in.
sin_addr
struct sockaddr_in.
struct in_addr.
See also
Get the IP address of a network interface in C using SIOCGIFADDR
Further reading
netdevice(7), Linux manpage
(Note that SIOCGIFADDR was not documented in netdevice(7) until version 3.40 of the Linux manpages project, which was released in April 2012, so at the time of writing it had not been
incorporated into the stable releases of most GNU/Linux distributions. The ioctl itself has been
present in Linux since 1993.)
1 Objective
2 Background
3 Scenario
4 Method
4.1 Overview
4.2 Create an ifreq structure for passing data in and out of ioctl
5 Further reading
Tested on
Debian (Lenny, Squeeze)
Ubuntu (Lucid, Precise,
Trusty)
Objective
To get the index number of a Linux network interface in C using the ioctl command SIOCGIFINDEX
Background
Network interfaces are usually identified by name in user-facing contexts, but for some APIs a
number is used instead. A notable example is the sin6_scope_id field of an IPv6 socket address with
link scope. Indices are also used in some types of netlink message (particularly those concerned
with routing) and in socket addresses for AF_PACKET sockets.
The interface index is typically not the same as the suffix which may form part of the interface
name. For example, on one of the machines tested by the author, eth0 had an index of 2. You
should not assume that they will be the same on other machines, or that they will necessarily
remain the same following a reboot.
Scenario
Suppose you wish to send a raw Ethernet frame using an AF_PACKET socket. To do this you need to
know the index number of the network interface from which the frame is to be sent.
The variable if_name points to a null-terminated string containing the name of the interface.
Method
Overview
On Linux-based systems the index number of a network interface can be obtained using
the ioctl command SIOCGIFINDEX. The method described here has five steps:
1. Create an
ifreq
ioctl.
ioctl.
<errno.h>
<string.h>
<sys/ioctl.h>
<net/if.h>
// Copy the interface name into the request structure, rejecting any name that
// would not fit in ifr.ifr_name together with its terminating null byte.
size_t if_name_len=strlen(if_name);
if (if_name_len>=sizeof(ifr.ifr_name)) {
    die("interface name is too long");
} else {
    // if_name[if_name_len] is the terminator, so copying one extra byte
    // transfers the name and its null byte in a single call.
    memcpy(ifr.ifr_name,if_name,if_name_len+1);
}
Invoke ioctl
Once you have the ifreq structure and socket descriptor then you are ready to invoke ioctl:
// Look up the interface index; on success it is returned in ifr.ifr_ifindex.
// die() is defined elsewhere (presumably it reports the message and terminates).
if (ioctl(fd,SIOCGIFINDEX,&ifr)==-1) {
die("%s",strerror(errno));
}
If this completes without error then the interface index should have been returned
in ifr.ifr_ifindex.
Further reading
netdevice(7) (Linux manpage)
Content
1 Objective
3 Method
4 Testing
5 Variations
6 Methods to avoid
6.1 Use the daemon function
Tested on
Debian (Etch, Lenny,
Squeeze)
Fedora (14)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Precise,
Trusty)
Objective
To cause a process to become a daemon in C
Method
Fork, allowing the parent process to terminate
Calling fork has three possible types of return value:
-1 indicates failure (most likely due to lack of memory, although it is
possible to run out of other resources such as PIDs).
0 indicates that the child is running, in which case execution should
continue with the next step of the daemonisation process.
Any other value indicates that the parent is running, in which case the
process should terminate by calling _exit.
pid_t pid = fork();
if (pid == -1) {
die("failed to fork while daemonising (errno=%d)",errno);
} else if (pid != 0) {
_exit(0);
}
The SIGHUP handler must remain in place until it has absorbed the SIGHUP that the parent is expected
to send when it terminates. See below if you wish to install a SIGHUP handler for other purposes.
<errno.h>
<signal.h>
<fcntl.h>
<unistd.h>
// Turn the calling process into a daemon.
//
// Steps, in order: fork and let the parent exit, start a new session, fork
// again, change to the root directory, clear the umask, and reattach the three
// standard descriptors to /dev/null. Any failure is reported through die(),
// which is defined elsewhere (presumably it does not return).
void daemonise() {
    // First fork: the original process exits, the child continues.
    pid_t child = fork();
    switch (child) {
    case -1:
        die("failed to fork while daemonising (errno=%d)",errno);
        break;
    case 0:
        break;
    default:
        _exit(0);
    }

    // Become the leader of a new session.
    if (setsid()==-1) {
        die("failed to become a session leader while daemonising(errno=%d)",errno);
    }

    // Second fork. SIGHUP is ignored beforehand so that the hangup expected
    // when the exiting parent terminates is absorbed.
    signal(SIGHUP,SIG_IGN);
    child = fork();
    switch (child) {
    case -1:
        die("failed to fork while daemonising (errno=%d)",errno);
        break;
    case 0:
        break;
    default:
        _exit(0);
    }

    // Move to the root directory.
    if (chdir("/") == -1) {
        die("failed to change working directory while daemonising (errno=%d)",errno);
    }

    // Clear the file creation mask.
    umask(0);

    // Drop the inherited standard descriptors ...
    close(STDIN_FILENO);
    close(STDOUT_FILENO);
    close(STDERR_FILENO);

    // ... and reopen them, in the same order, on /dev/null.
    int reopened = open("/dev/null",O_RDONLY);
    if (reopened == -1) {
        die("failed to reopen stdin while daemonising (errno=%d)",errno);
    }
    reopened = open("/dev/null",O_WRONLY);
    if (reopened == -1) {
        die("failed to reopen stdout while daemonising (errno=%d)",errno);
    }
    reopened = open("/dev/null",O_RDWR);
    if (reopened == -1) {
        die("failed to reopen stderr while daemonising (errno=%d)",errno);
    }
}
Testing
See Cause a process to become a daemon.
Variations
Redirect stdout and stderr to a logfile
When directing output to a logfile, it is best to open the file before closing stderr, so that the
daemon is not left with no means of reporting errors:
// Reattach stdin to /dev/null first; stdout and stderr are left open for now
// so that failures can still be reported.
close(STDIN_FILENO);
if (open("/dev/null",O_RDONLY) == -1) {
    die("failed to reopen stdin while daemonising (errno=%d)",errno);
}
// Open (creating if necessary) the logfile in append mode.
int logfile_fileno = open(logfile_pathname,O_RDWR|O_CREAT|O_APPEND,S_IRUSR|S_IWUSR|S_IRGRP);
if (logfile_fileno == -1) {
    die("failed to open logfile (errno=%d)",errno);
}
// Point stdout and stderr at the logfile. dup2 closes the target descriptor
// itself if necessary, but it can fail, so its result must be checked.
if (dup2(logfile_fileno,STDOUT_FILENO) == -1) {
    die("failed to redirect stdout to logfile (errno=%d)",errno);
}
if (dup2(logfile_fileno,STDERR_FILENO) == -1) {
    die("failed to redirect stderr to logfile (errno=%d)",errno);
}
// Release the original descriptor, unless open() happened to return one of the
// standard descriptors (possible if it was closed beforehand), in which case
// closing it would undo the redirection.
if (logfile_fileno > STDERR_FILENO) {
    close(logfile_fileno);
}
Note that dup2 will close the target file descriptor if necessary, so there is no need to do this
explicitly.
When installing the signal handler, it is better to use sigaction in preference to the signal function
because that allows the SA_RESTART flag to be used. Without this, it is necessary to place a loop
around any system function that is capable of returning EINTR:
// Install handle_sighup (defined elsewhere) as the SIGHUP handler.
struct sigaction sa;
// SA_RESTART avoids having to wrap system functions in an EINTR retry loop.
sa.sa_flags = SA_RESTART;
// No additional signals are blocked while the handler runs.
sigemptyset(&sa.sa_mask);
sa.sa_handler = handle_sighup;
if (sigaction(SIGHUP,&sa,0) == -1) {
    die("failed to install SIGHUP handler (errno=%d)",errno);
}
Methods to avoid
Use the daemon function
Many POSIX-based operating systems provide a function called daemon which performs some or all
of the steps listed above. Unfortunately it has three significant drawbacks:
It is not available on all systems.
Its behaviour is not standardised (or necessarily well-documented).
Its behaviour is more difficult to customise.
For these reasons, any benefit gained by using the daemon function is likely to be a short-term one at
best.
1 Objective
2 Scenario
3 Method
4 Alternative
4.1 Using sprintf
Tested on
Ubuntu (Lucid, Precise)
Objective
To pad an integer with leading zeros to a given minimum width when converting it to a character
string in C++
Scenario
Suppose you are writing a program for generating customer invoices. Each customer has an
account number. These are represented internally as integers, but when converted to character
strings for display or printing you want them to be padded to 8 digits using leading zeros.
Method
The method described here uses the C++ iostream library to perform the conversion. It requires an
output stream for the result to be sent to, however a std::ostringstream can be used to capture the
character sequence and present it as a std::string if required. Padding with zeros is achieved by
combining the effect of three standard manipulators:
std::setw,
std::setfill,
std::internal,
to arrange for padding to occur after the sign but before the
remainder of the number.
Used by
<ios>
std::internal
<iomanip>
std::setw, std::setfill
<sstream>
std::ostringstream
If you are using a std::ostringstream that will be discarded immediately after the conversion then
simply write the three manipulators to the stream (in any order) followed by the value to be
converted:
// Convert an account number to a string padded with leading zeros to a
// minimum width of 8 characters.
//
// std::internal places the padding after the sign, so -1 becomes "-0000001"
// rather than "000000-1". The width is a minimum only: longer values are
// written in full.
std::string format_account_number(int acct_no) {
    // Fully qualified like every other name here, so the function compiles
    // without a using-directive.
    std::ostringstream out;
    out << std::internal << std::setfill('0') << std::setw(8) << acct_no;
    return out.str();
}
If the stream will be used subsequently for other purposes then you will probably want to reset the
fill character and field adjustment properties, otherwise they will remain in effect for later output. It
is not necessary to do this for the field width, which is automatically reset to zero after each field is
written:
// Write an account number to `out`, zero-padded to a minimum width of 8, and
// leave the stream's formatting state as the caller had it.
//
// The field width resets itself after each formatted output, but the fill
// character and adjustment flags are sticky. They are saved and restored here
// rather than forced to particular values, so whatever the caller had
// configured stays in effect for later output.
void write_account_number(std::ostream& out, int acct_no) {
    const std::ios_base::fmtflags saved_flags = out.flags();
    const char saved_fill = out.fill();
    out << std::internal << std::setfill('0') << std::setw(8) << acct_no;
    out.flags(saved_flags);
    out.fill(saved_fill);
}
The std::internal manipulator can be omitted if the number is unsigned or known to be non-negative, but it is needed in the general case because otherwise the padding characters will
be inserted at the far left of the field by default (producing output such as 000000-1 as opposed to
-0000001).
Be aware that std::setw controls the total width of the field (including the sign if there is
one), with the consequence that negative values will by default be one digit shorter than non-negative values. If this is a problem then std::showpos can be used to ensure that there is always a
sign (plus or minus), in which case the number of digits remains constant.
Floating point values can be padded in a similar manner. The same applies to character strings,
except that std::internal would be ineffective.
Alternative
Using sprintf
A similar effect can be achieved using std::snprintf from <cstdio>:
// snprintf-based variant: formats acct_no with "%08d" into a fixed buffer.
// Output longer than 8 characters is truncated to fit the buffer.
std::string format_account_number(int acct_no) {
    char digits[8 + 1];  // 8 characters plus the terminating null
    std::snprintf(digits, sizeof digits, "%08d", acct_no);
    return std::string(digits);
}
For a typical implementation of the standard library this method is likely to be significantly faster
than using a std::ostringstream (and would be faster still if std::string were avoided too). The
cost is that buffer management and type safety become your responsibility, with undefined
behaviour the likely consequence if you make a mistake.
A minor difference is that std::snprintf will truncate to whatever buffer length you have chosen,
whereas std::ostringstream will not.
1 Objective
2 Background
3 Scenario
4 Method
4.1 Overview
4.4 Link the object code using the -fPIC and -shared options
5 Testing
6 Alternatives
Tested on
Debian (Etch, Lenny,
Squeeze)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Oneiric,
Precise, Quantal)
Objective
To build a shared library using GCC
Background
Programs can be linked against libraries either at compile time or at run time. An advantage of
linking at run time is that a single copy of the library can be shared between many programs, both
on disc and in memory. Libraries suitable for use in this way are known as shared libraries.
On modern Linux-based systems, shared libraries differ from static ones in the following respects:
they are ELF files (as opposed to archives compatible with
the ar program),
they have a dynamic symbol table (in addition to a static table), and
the code within them must be position-independent.
For these reasons, some adjustments to the build process are needed to create a shared library
instead of a static one.
Scenario
Suppose that you are building a library named libqux which is written in C. There are three source
files: foo.c, bar.c and baz.c.
The current version number of libqux is 1.5.0. It is fully backward-compatible with the previous
version, 1.4.1, which had an soname of libqux.so.1.
Method
Overview
The method described here has three steps:
1. Choose an soname (if required).
-fPIC
-fPIC
and
option.
-shared
options.
-fPIC
option:
This option is not enabled by default because it tends to cause some loss of performance, and for
purposes other than building shared libraries it is often not necessary.
Link the object code using the -fPIC and -shared options
The default behaviour of the gcc and g++ commands when linking is to produce an executable
program. They can be instructed to produce a shared library instead by means of -shared option:
gcc -shared -fPIC -Wl,-soname,libqux.so.1 -o libqux.so.1.5.0 foo.o bar.o baz.o -lc
The -fPIC option is needed when linking as it was when compiling to ensure that any code added
by the linker is compatible with code previously generated by the compiler.
The -Wl option passes a comma-separated list of arguments to the linker. As its name suggests, -soname specifies the required soname. If these options are omitted then the library will not have an
soname.
The ldconfig manpage recommends explicitly linking against libc, which has been done above
using the-l option (-lc).
Testing
One way to test the library is to install it in a directory on the library search path. /usr/local/lib is
usually the most appropriate choice. You will need to create softlinks corresponding to the soname
of the library, and the name used to refer to the library when building the executable, if these are
different from the filename:
ln -s libqux.so.1.5.0 libqux.so.1
ln -s libqux.so.1.5.0 libqux.so
A partial alternative is to run ldconfig, which automatically creates the first of the above softlinks
but not the second. However you do it, this method of testing normally requires administrative
privileges. Once installed, it should be possible to link against the library using -l:
gcc main.c -lqux
If you cannot or do not want to move the library to /usr/local/lib then it is possible to link against
the library in situ. At build time this can be done by listing the pathname of the library as an
argument to gcc without use of the -l option:
gcc main.c libqux.so.1.5.0
At load time you will need to add the relevant directory to the library search path. This can be done
by setting the environment variableLD_LIBRARY_PATH, for example:
export LD_LIBRARY_PATH=`pwd`
As above, you will need to create a softlink corresponding to the soname of the library. If there is a
need to search multiple directories then they should be specified as a colon-separated list
in LD_LIBRARY_PATH.
Alternatives
Using GNU Libtool
Libtool is part of GNU Autotools. Its purpose is to simplify the process of building shared libraries,
particularly those intended for use on multiple platforms. For example, for the scenario described
above you could use the following sequence of commands:
libtool
libtool
libtool
libtool
--mode=compile gcc
--mode=compile gcc
--mode=compile gcc
--mode=link gcc -o
-c foo.c
-c bar.c
-c baz.c
libqux.la foo.lo bar.lo baz.lo -rpath /usr/local/lib -version-info 6:0:5
You may not need to run these commands explicitly, because Libtool is often used in conjunction with
Automake which has the ability to generate them automatically, but it is equally suitable for use as
a stand-alone utility if that suits your purpose.
Be aware that Libtool requires the use of a specific numbering scheme for specifying the interface
version (passed using the -version-infooption above), and that this should almost certainly not be
equal to the release version. The Libtool manual describes when and how these values should be
changed.
Further reading
Program Library HOWTO, David A Wheeler
Libtools versioning system, GNU Libtool Manual, GNU Project
Vaughan et al, Library Versioning, GNU Autoconf, Automake and Libtool
ldconfig(8) (Ubuntu manpage)
1 Objective
2 Scenario
3 Method
3.1 Overview
3.3 Connect the entrance of the pipe to STDOUT_FILENO within the child process
3.4 Close the entrance of the pipe within the parent process
3.5 Close the exit from the pipe within the child process
4 Alternatives
5 See also
6 Further reading
Tested on
Debian (Etch, Lenny,
Squeeze)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Oneiric,
Precise, Quantal)
Objective
To capture the standard output of a child process in C
Scenario
Suppose that you are writing a program which executes a command as a child process
using fork and exec:
pid_t pid = fork();
if (pid == -1) {
perror("fork");
exit(1);
} else if (pid == 0) {
The command is expected to write some text to stdout and you wish to capture this output for use
by the parent process.
Method
Overview
The method described here has four steps:
1. Create a new pipe using the
pipe
function.
STDOUT_FILENO
Used by
<errno.h>
errno, EINTR
<stdio.h>
perror
<stdlib.h>
exit
<unistd.h>
<sys/wait.h>
wait, pid_t
The file descriptor for the entrance to the pipe is written to filedes[1] and the exit to filedes[0].
The former must be transferred to the child process, the latter retained by the parent process. The
simplest way to arrange this is to create the pipe before the child process is forked (thus ensuring
that each process receives a copy of both descriptors).
Connect the entrance of the pipe to STDOUT_FILENO within the child process
When a process forks, the child inherits a set of file descriptors that are copies of those owned by
the parent process. Consequently, if the standard output of the parent process is routed to a
particular terminal device then the same will be true of the child process (in the first instance).
To capture the output of the child process, its standard output must instead be routed into the pipe.
This can be arranged using the dup2command:
while ((dup2(filedes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {}
The effect is to close the file descriptor STDOUT_FILENO if it was previously open, then (re)open it as
a copy of filedes[1]. A loop is needed to allow for the possibility of dup2 being interrupted by a
signal. Once this has been done, filedes[1] can be closed:
close(filedes[1]);
It would be equally acceptable to copy the descriptor onto STDERR_FILENO in order to capture the
standard error stream. To capture both stdoutand stderr you can either create two separate pipes,
or if it is acceptable for the streams to be mixed, copy the same file descriptor onto
bothSTDOUT_FILENO and STDERR_FILENO by calling dup2 twice.
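Close the entrance of the pipe within the parent process
The parent process has no need to access the entrance to the pipe, so within the parent filedes[1]
should be closed:
close(filedes[1]);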
Close the exit from the pipe within the child process
Similarly, the child process has no need to access the exit from the pipe:
close(filedes[0]);
(You should also have made arrangements to close any other file descriptors not needed by the
child process, regardless of whether you want to capture its output.)
Sample code
The code for managing the pipe can be integrated into the existing program as follows:
// Create the pipe before forking so that both processes receive copies of
// both descriptors: filedes[1] is the entrance, filedes[0] the exit.
int filedes[2];
if (pipe(filedes) == -1) {
    perror("pipe");
    exit(1);
}
pid_t pid = fork();
if (pid == -1) {
    perror("fork");
    exit(1);
} else if (pid == 0) {
    // Child: route stdout into the pipe, retrying if interrupted by a signal.
    while (dup2(filedes[1], STDOUT_FILENO) == -1) {
        if (errno != EINTR) {
            // Any other failure means the output would not be captured, so
            // give up rather than run the command with the wrong stdout.
            perror("dup2");
            _exit(1);
        }
    }
    // The child needs neither of the original pipe descriptors any longer.
    close(filedes[1]);
    close(filedes[0]);
    execl(cmdpath, cmdname, (char*)0);
    // Only reached if execl failed.
    perror("execl");
    _exit(1);
}
// Parent: close the entrance so that end-of-file is seen on the exit once the
// child has finished writing.
close(filedes[1]);
It is then possible for the parent process to read the output of the child process from file
descriptor filedes[0]:
char buffer[4096];
while (1) {
ssize_t count = read(filedes[0], buffer, sizeof(buffer));
if (count == -1) {
if (errno == EINTR) {
continue;
} else {
perror("read");
exit(1);
}
} else if (count == 0) {
break;
} else {
handle_child_process_output(buffer, count);
}
}
close(filedes[0]);
wait(0);
If you need to avoid blocking while waiting for output from the child then this can be arranged
using select, O_NONBLOCK or similar.
Alternatives
Using O_CLOEXEC to close file descriptors
If you want to capture its output then it is quite likely that (as in this example) the child process will
be calling a function from the exec family to transfer control to another program. An alternative
method is then available for closing the pipe exit within the child process, by setting
theO_CLOEXEC flag:
// Mark the exit of the pipe as close-on-exec, so that no explicit close is
// needed in the child provided that exec is called.
if (fcntl(filedes[0], F_SETFD, FD_CLOEXEC) == -1) {
perror("fcntl");
exit(1);
}
This should be done in the parent process prior to forking. It avoids the need to take any explicit
action within the child process to close the file descriptor, provided that exec is called. This makes
little difference if there is only one file descriptor to close, but when there are many child processes
executing in parallel the benefits are more noticable: one system call is needed instead of many, and
because the flag can be set immediately when the pipe is created there is less risk of file descriptors
being missed.
Using popen
The popen function provides most of the functionality described above in the form of a single
function call:
FILE* fp = popen("pwd", "r");
// ...
int status = pclose(fp);
popen
See also
Reap zombie processes using a SIGCHLD handler
Further reading
pipe, Base Specifications Issue 7, The Open Group, 2008
dup, Base Specifications Issue 7, The Open Group, 2008
Tags: c | posix | process
1 Objective
2 Background
3 Scenario
4 Method
4.1 Overview
5 Alternatives
6 See also
7 Further reading
Tested on
Debian (Etch, Lenny,
Squeeze)
Ubuntu (Hardy, Intrepid,
Jaunty, Karmic, Lucid,
Maverick, Natty, Oneiric,
Precise, Quantal)
Objective
To install a SIGCHLD handler for reaping zombie processes
Background
When a child process terminates it does not disappear entirely. Instead it becomes a zombie
process which is no longer capable of executing, but which still has a PID and an entry in the
process table. This is indicated by the state code Z in ps or top.
The presence of a moderate number of zombie processes is not particularly harmful, but they add
unnecessary clutter that can be confusing to the administrator. In extreme cases they could exhaust
the number of available process table slots. For these reasons, well-behaved programs should
ensure that zombie processes are removed in a timely manner.
The process of eliminating zombie processes is known as reaping. The simplest method is to
call wait, but this will block the parent process if the child has not yet terminated. Alternatives are
to use waitpid to poll or SIGCHLD to reap asynchronously. The method described here uses SIGCHLD.
Scenario
Suppose you have written a network server which spawns a separate child process to handle each
connection. The child process terminates itself when the connection closes, without any
involvement from the parent process. It would be unacceptable for the parent process to block,
therefore calling wait immediately after fork is not an option.
Method
Overview
The method described here has two steps:
1. Define a handler for
2. Register the
SIGCHLD
SIGCHLD
that calls
waitpid.
handler.
Note that the signal is named SIGCHLD with an H, as opposed to SIGCLD (which has a similar function,
but potentially different semantics and is non-portable).
The following header files are used:
Header
Used by
<signal.h>
<stdio.h>
perror
<stdlib.h>
exit
<sys/wait.h>
The reason for calling waitpid as opposed to wait is to allow use of the WNOHANG option, which
prevents the handler from blocking. This allows for the possibility of SIGCHLD being raised for
reasons other than the termination of a child process.
(SIGCHLD has three conventional uses: to indicate that a child process has terminated, stopped or
continued. The latter two conditions can be suppressed using SA_NOCLDSTOP as described below, but
that would not prevent a process with the right permissions from raising SIGCHLD
using the kill function or an equivalent.)
The reason for placing waitpid within a loop is to allow for the possibility that multiple child
processes could terminate while one is in the process of being reaped. Only one instance
of SIGCHLD can be queued, so it may be necessary to reap several zombie processes during one
invocation of the handler function.
The loop ensures that any zombies which existed prior to invocation of the handler function will be
reaped. If any further zombies come into being after that moment in time then they may or may not
be reaped by that invocation of the handler function (depending on the timing), but they should
leave behind a pending SIGCHLD that will result in the handler being called again.
sigaction
function:
You should do this before any child processes terminate, which in practice means registering before
any are spawned. (POSIX neither requires nor prohibits SIGCHLD being raised in respect of a child
that had already terminated when the handler was registered, so a program which relied on this
happening might work but would not be portable.)
When an operating system function is interrupted by a signal the default behaviour is to return
immediately (either with the error EINTR, or reporting partial completion if that is possible). This
creates a need for such functions to be wrapped in a loop for the purpose of handling EINTR, which
is both inconvenient and error-prone. Setting the SA_RESTART flag when the signal is registered
makes this unnecessary in most cases, and is recommended unless you have a good reason not to.
Setting the SA_NOCLDSTOP flag prevents SIGCHLD from being raised when a child process stops or
continues (as opposed to terminating). Since our interest is confined to processes that have
terminated, there is no harm in this and it may prevent the handler being invoked unnecessarily. It
does not obviate the need to use WNOHANG within the handler because it does not
prevent SIGCHLD from being raised in some other way.
Alternatives
Explicitly set the SIGCHLD handler to SIG_IGN
If (as in the example above) the signal handler does nothing beyond calling waitpid then an
alternative is available. Setting the SIGCHLD handler to SIG_IGN will cause zombie processes to be
reaped automatically:
// Explicitly set the SIGCHLD disposition to SIG_IGN so that zombie processes
// are reaped automatically.
struct sigaction sa;
sa.sa_flags = 0;
sigemptyset(&sa.sa_mask);
sa.sa_handler = SIG_IGN;
if (sigaction(SIGCHLD, &sa, 0) == -1) {
    perror(0);
    exit(1);
}
This can be implemented portably and somewhat more concisely with the signal function if you
prefer:
Note that it is not sufficient for SIGCHLD to have a disposition that causes it to be ignored (as the
default, SIG_DFL, would do): it is only by setting it to SIG_IGN that this behaviour is obtained.
One drawback of this method is that it is slightly less portable than explicitly calling waitpid: the
behaviour it depends on is required by POSIX.1-2001, and previously by the Single Unix
Specification, but not by POSIX.1-1990.
There is one small advantage to using SA_NOCLDWAIT: if it is supported at all then you can be
reasonably confident that it will have the desired behaviour, whereas for SIG_IGN this is assured
only if the operating system declares conformance to an appropriate version of POSIX or SUS.
See also
Capture the output of a child process in C
Further reading
wait, waitpid, Base Specifications Issue 7, The Open Group, 2008
<signal.h>, Base Specifications Issue 7, The Open Group, 2008
1 Objective
2 Background
3 Scenario
4 Method
4.1 Overview
5 Testing
6 Variations
6.3 Omitting the conversion between network and host byte order
7 Further reading
Tested on
Debian (Lenny)
Objective
To calculate an Internet Protocol checksum in C
Background
RFC 791 defines the following checksum algorithm for use when constructing the header of an
IPv4 datagram:
The checksum field is the 16 bit one's complement of the one's complement
sum of all 16 bit words in the header. For purposes of computing the
checksum, the value of the checksum field is zero.
The same algorithm is used by a number of other IP-based protocols including TCP, UDP and
ICMP. Implementation techniques are discussed in RFC 1071, RFC 1141 and RFC 1624.
Scenario
Suppose that you wish to send an ICMP echo request using a raw socket. Like all ICMP messages
this contains a checksum that is calculated using the algorithm described above. Given the message
to be sent, you wish to calculate the required checksum.
Method
Overview
The checksum can be calculated using the following algorithm:
1. Set the checksum field to zero.
2. Pad the data to an even number of bytes.
3. Reinterpret the data as a sequence of 16-bit unsigned integers that are in
network byte order.
4. Calculate the sum of the integers, subtracting 0xffff whenever the sum
reaches 0x10000 or greater.
5. Calculate the bitwise complement of the sum. This is the required value
of the checksum field.
Ones complement notation has two representations for the number zero: normal zero ( 0x0000 in
this case) and negative zero (0xffff). It is not completely clear how these should be handled:
RFC 791 states only that ones complement arithmetic should be used,
and does not address the question of how zero is represented.
The data should be passed to the function in network byte order with the checksum field already
zeroed. The result is returned in network byte order, so is ready to be written directly into the
checksum field.
If there is an odd byte at the end of the data then this is treated as a special case so that padding can
be done on the fly. The calls to memcpy are needed to avoid breaking the strict aliasing rules, which
prevent an arbitrary type from being safely cast to a uint16_t.
The maximum length of message that can be processed by this function is limited to approximately
16 gigabytes by the number of deferred carries that can be accumulated. In the unlikely event that
this is insufficient then the upper half of the accumulator can be folded into the lower half as often
as is necessary to prevent an overflow. This is more likely to be required when processing 16-bit
blocks using a 32-bit accumulator, in which case only 128 kilobytes can be processed without the
risk of overflow.
Testing
Here is an example of how an 8-byte ICMP echo request might be constructed using
the icmphdr structure type provided by glibc:
// Build an 8-byte ICMP echo request (type 8, code 0) and fill in its checksum.
struct icmphdr req;
req.type=8;
req.code=0;
// The checksum field must be zero while the checksum is being calculated.
req.checksum=0;
// Multi-byte fields are stored in network byte order.
req.un.echo.id=htons(0x1234);
req.un.echo.sequence=htons(1);
// ip_checksum returns its result in network byte order, so it can be stored
// directly; 8 is the length of the message in bytes.
req.checksum=ip_checksum(&req,8);
Variations
Verifying a checksum
There are two ways in which checksums of the type described here can be verified:
by calculating what the checksum should be using the normal method,
then comparing this to the value received, or
by calculating the checksum without first zeroing the checksum field,
then comparing this with normal zero (0x0000).
The second method is likely to be simpler, quicker and more convenient in most cases. If you
should decide to use the first method then some care is needed with regard to negative and normal
zero. RFC 1624 recommends that either be accepted (in accordance with the robustness principle:
be conservative in what you send, liberal in what you accept). This can be achieved by normalising
the received checksum before performing the comparison.
(No special action is required when using the second method, provided that the checksum algorithm
used to perform the verification consistently returns normal zero in preference to negative zero. A
minor optimisation would be to omit the final inversion and compare the accumulator with negative
zero.)
This is allowed by C99, but not by C89 or C++. It has the disadvantage that the union must be
constructed by the caller if copying is to be avoided, and this may not always be practicable.
The alternative is to reinterpret the data by means of a type cast. This would not normally be safe in
any variant of C or C++, and would be quite likely to fall foul of the aliasing rules that are specified
by C99. However in some compilation environments it can be made safe (or at least, less unsafe)
by disabling strict application of the aliasing rules. In the case of GCC this is done using the -fno-strict-aliasing option or the may_alias attribute.
It should be noted that the removal of memcpy will not necessarily improve the performance of the
checksum function because the compiler may already be able to achieve the same result without
assistance. For example, GCC can do this in some cases when optimisation is enabled. It would be
advisable to determine whether there is any benefit to be gained before making non-portable
changes to the source code.
gives 0x0179. This is due to the carry from the most significant byte of each block being fed back into
the least significant byte and vice versa. It might therefore appear that the calls
to ntohs and htons made above are redundant. This is almost, but not quite, correct.
The usual behaviour of ntohs is to either do nothing or reverse the byte order. In either of these
cases the calls to ntohs and htons cancel out and could be removed. However POSIX states quite
clearly that an arbitrary rearrangement of the bit pattern could occur, so if you want to be certain
that the algorithm will behave as intended then an explicit conversion to host byte order is
necessary.
Further reading
J. Postel, Internet Protocol - DARPA Internet Program Protocol
Specification, STD 5, RFC 791, DARPA, September 1981
R. Braden, D. Borman and C. Partridge, Computing the Internet
Checksum, RFC 1071, September 1988
T Mallory and A. Kullberg, Incremental Updating of the Internet
Checksum, RFC 1141, January 1990
A. Rijsinghani, Computation of the Internet Checksum via Incremental
Update, RFC 1624, May 1994
1 Objective
2 Scenario
3 Method
3.1 Overview
4 Variations
5 See also
6 Further Reading
Tested on
Debian (Lenny)
Objective
To send an outbound UDP datagram in C
Scenario
Suppose that you wish to write a client that implements the UDP-based variant of the Daytime
Protocol, as defined by RFC 867
This is a very simple protocol whereby the client sends a datagram to the server, then the server
responds with a datagram containing a human-readable copy of the current date and time. The
datagram from the client is not required to have any particular content.
Method
Overview
The method described here has three steps:
1. Construct the remote socket address.
2. Create a UDP socket.
3. Send the datagram.
The following header files will be needed:
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_DGRAM. This allows UDP
but excludes TCP.
socket
function. This
// Create a socket using the address family, socket type and protocol recorded
// in the addrinfo entry that res points to.
int fd=socket(res->ai_family,res->ai_socktype,res->ai_protocol);
if (fd==-1) {
// socket failed: pass the errno description to die (not defined in this excerpt).
die("%s",strerror(errno));
}
The fourth argument is for specifying flags which modify the behaviour of sendto, none of which
are needed in this example.
The value returned by sendto is the number of bytes sent, or -1 if there was an error. UDP
datagrams are sent atomically, so unlike when writing to a TCP socket there is no need to wrap the
function call in a loop to handle partially-sent data.
// Send the datagram described by message; the third argument (flags) is zero.
if (sendmsg(fd,&message,0)==-1) {
// sendmsg failed: pass the errno description to die.
die("%s",strerror(errno));
}
The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to sendmsg it specifies where the destination address, the datagram
payload and any ancillary data are stored. In this example no ancillary data has been provided.
If you wish to pass any flags into sendmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to sendmsg (which is zero in
this example).
Variations
Sending to the IPv4 broadcast address
By default, attempts to send a datagram to the broadcast address are rejected with an error
(typically EACCES, however it is not obvious from the POSIX specification which error should
occur). This is a safety measure intended to reduce the risk of making unintended broadcasts. It can
be overridden by setting the SO_BROADCAST socket option:
// Enable the SO_BROADCAST option on fd so that datagrams sent to the
// broadcast address are no longer rejected.
int broadcast=1;
if (setsockopt(fd,SOL_SOCKET,SO_BROADCAST,
&broadcast,sizeof(broadcast))==-1) {
// setsockopt failed: pass the errno description to die.
die("%s",strerror(errno));
}
Replying to a datagram
When replying to a UDP datagram the response should normally be sent to the IP address and port
number from which the request originated. This can be arranged by capturing the source address of
the request using recvfrom or recvmsg, then passing it to sendto or sendmsg as the destination
address for the response.
There is also the question of where the response should be sent from. In most cases the best choice
will be from the port and IP address to which the request was directed. This is not a requirement of
the User Datagram Protocol itself, however there are several reasons why it is desirable:
Generic firewalls and NAT gateways normally use both source and
destination port numbers and IP addresses for connection tracking (as
per RFC 2663) so will fail to associate the response with the request if it is
not sent from the appropriate port and IP address.
This is superficially identical to the call that would be made to establish a TCP connection,
however unlike TCP there is no handshake. This has two notable consequences:
Calling connect on a UDP socket does not (by itself) result in any network
activity.
The call to connect will succeed even if the remote machine is
unreachable or nonexistent.
A UDP socket in the connected state will only receive datagrams that originate from the given
remote address. It is therefore feasible to use functions such as read or recv in place of recvfrom.
Similarly the given remote address becomes the default for outgoing datagrams, therefore it is
feasible to use write or send in place of sendto. (Being connected does not, however, prevent you
from sending datagrams to arbitrary destinations using sendto if you so wish.)
See also
Listen for and receive UDP datagrams in C
Establish a TCP connection in C
Send an arbitrary IPv4 datagram using a raw socket in C
Further Reading
W. Richard Stevens et al, Unix Network Programming, Volume 1: The
Sockets Networking API, 3rd edition, Addison-Wesley, 2003
The Open Group, sendto, Base Specifications Issue 6
The Open Group, sendmsg, Base Specifications Issue 6
1 Objective
2 Scenario
3 Method
3.1 Overview
4 Variations
5 See also
6 Further Reading
Tested on
Debian (Lenny)
Ubuntu (Lucid)
Objective
To listen for and receive inbound UDP datagrams in C
Scenario
Suppose that you wish to write a server that implements the UDP-based variant of the Daytime
Protocol, as defined by RFC 867
This is a very simple protocol whereby the client sends a datagram to the server, then the server
responds with a datagram containing a human-readable copy of the current date and time. The
datagram from the client is not required to have any particular content.
Method
Overview
The method described here has four steps:
1. Construct the local socket address.
2. Create the socket.
3. Bind the local address to the socket.
4. Receive and handle datagrams as they arrive.
This is the appropriate procedure when listening for unsolicited datagrams, as in the scenario
described above. See below for how it can be adapted to:
listening for a reply to a datagram that you have sent, or
exchanging many datagrams with a particular remote host.
The following header files will be needed:
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_DGRAM. This allows UDP
but excludes TCP.
The AI_ADDRCONFIG flag has been set so that IPv6 results will only be
returned if the server has an IPv6 address, and similarly for IPv4.
The res argument is used to return a linked list of addrinfo structures containing the address or
addresses that were found. If the network service daemon has the ability to listen on multiple
sockets then it should open one for each address in the list. Otherwise it is considered acceptable to
use the first result and discard the remainder.
The memory occupied by the result list should be released by calling freeaddrinfo once it is no
longer needed, however this cannot be done until after the socket has been created and bound.
The first argument is the socket descriptor. The second and third arguments are the local address
and its length.
If the local address was constructed using getaddrinfo then the memory occupied by the address
list can now be released:
freeaddrinfo(res);
(If the address list has been searched or filtered then take care that it is the head of the list that is
released, not the address that you have chosen to use.)
The recvmsg function explicitly reports truncation by setting the MSG_TRUNC flag in
the msg_flags member of the message header. Alternatively, truncation can be detected when using
any of the available functions by providing a buffer that is one byte longer than the largest payload
that you actually wish to receive, then interpreting a full buffer as a truncated datagram.
The fourth argument is for specifying flags which modify the behaviour of recvfrom, none of which
are needed in this example.
The value returned by recvfrom is the number of bytes received, or -1 if there was an error.
Truncation is detected in this example using the technique described above of providing a slightly
over-sized datagram buffer.
The purpose of the iovec array is to provide a scatter/gather capability so that the datagram payload
need not be stored in a contiguous region of memory. In this example the entire payload is stored in
a single buffer, therefore only one array element is needed.
The msghdr structure exists to bring the number of arguments to recvmsg and sendmsg down to a
manageable number. On entry to recvmsg it specifies where the source address, the datagram payload
and any ancillary data should be stored. In this example no ancillary data has been requested,
therefore no provision has been made for receiving any.
The msg_flags field of the msghdr structure is used by recvmsg to return flags to the caller. These
include the MSG_TRUNC flag, which on exit will be set if the datagram was truncated or clear if it was
not. If you wish to pass any flags into recvmsg then this cannot be done using msg_flags, which is
ignored on entry. Instead you must pass them using the third argument to recvmsg (which is zero in
this example).
Variations
Listening for a reply
When listening for a reply to a datagram that you have sent then three of the four steps listed above
may be omitted:
You can (and normally should) listen for the reply using the same socket
from which the request was sent.
The act of sending the request will have bound the socket to an unused
port number. This will have been used as the source of the request, so
should match the destination of the reply. The socket is therefore
correctly bound to receive the reply.
This is superficially identical to the call that would be made to establish a TCP connection,
however unlike TCP there is no handshake. This has two notable consequences:
Calling connect on a UDP socket does not (by itself) result in any network
activity.
The call to connect will succeed even if the remote machine is unreachable
or nonexistent.
A UDP socket in the connected state will only receive datagrams that originate from the given
remote address. It is therefore feasible to use functions such as read or recv in place of recvfrom.
Similarly the given remote address becomes the default for outgoing datagrams, therefore it is
feasible to use write or send in place of sendto. (Being connected does not, however, prevent you
from sending datagrams to arbitrary destinations using sendto if you so wish.)
See also
Send a UDP datagram in C
Listen for and accept TCP connections in C
Further Reading
W. Richard Stevens et al, Unix Network Programming, Volume 1: The
Sockets Networking API, 3rd edition, Addison-Wesley, 2003
The Open Group, recvfrom, Base Specifications Issue 6
The Open Group, recvmsg, Base Specifications Issue 6
1 Objective
2 Scenario
3 Method
3.1 Overview
4 See also
5 Further Reading
Tested on
Debian (Lenny)
Ubuntu (Precise)
Objective
To establish an outbound TCP connection in C
Scenario
Suppose that you wish to write a client that implements the TCP-based variant of the Daytime
Protocol, as defined by RFC 867
This is a very simple protocol whereby the server sends a human-readable copy of the current date
and time then closes the connection. The client is not required to send any data, and anything it
does send is ignored.
Method
Overview
The method described here has three steps:
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
// NOTE(review): hints is declared and partly initialised on lines that are
// missing from this excerpt - confirm against the complete example.
// The protocol is left unspecified (zero).
hints.ai_protocol=0;
hints.ai_flags=AI_ADDRCONFIG;
// Resolve hostname and portname; the list of results is returned through res.
struct addrinfo* res=0;
int err=getaddrinfo(hostname,portname,&hints,&res);
if (err!=0) {
// A non-zero return value is an error code, reported here numerically.
die("failed to resolve remote socket address (err=%d)",err);
}
The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_STREAM. This allows TCP
but excludes UDP.
The protocol has been left unspecified because it is only meaningful in
the context of a specific address family. If the address family had been set
to AF_INET or AF_INET6 then this field could have been set to
IPPROTO_TCP (but it is equally acceptable to leave it set to zero).
The AI_PASSIVE flag has not been set because the result is intended for
use as a remote address. Its absence causes the IP address to default to
the loopback address (as opposed to the wildcard address).
The AI_ADDRCONFIG flag has been set so that IPv6 results will only be
returned if the server has an IPv6 address, and similarly for IPv4.
The res argument is used to return a linked list of addrinfo structures containing the address or
addresses that were found. If multiple records are returned then the recommended behaviour
(from RFC 1123) is to try each address in turn, stopping when a connection is successfully
established. When doing this you may wish to limit the number of addresses tried and/or allow
connection attempts to overlap, in order to prevent the cumulative timeout period from becoming
excessive.
The memory occupied by the result list should be released by calling freeaddrinfo once it is no
longer needed, however this cannot be done until after the socket has been connected.
socket
function.
2. the socket type (SOCK_STREAM in this case, meaning that the socket
should provide reliable transport of an unstructured byte stream), and
3. the protocol (IPPROTO_TCP in this case, corresponding to TCP).
A value of 0 for the protocol requests the default for the given address family and socket type,
which for AF_INET or AF_INET6 and SOCK_STREAM would be IPPROTO_TCP. It is equally acceptable for
the protocol to be deduced in this manner or specified explicitly.
Assuming you previously used getaddrinfo to construct the remote address then the required
values can be obtained from the addrinfo structure:
// Create a socket using the address family, socket type and protocol recorded
// in the addrinfo entry that res points to.
int fd=socket(res->ai_family,res->ai_socktype,res->ai_protocol);
if (fd==-1) {
// socket failed: pass the errno description to die (not defined in this excerpt).
die("%s",strerror(errno));
}
The first argument is the socket descriptor. The second and third arguments are the remote socket
address and its length.
By default the connect function blocks until the initial TCP handshake has been completed and the
socket is ready for use, or alternatively, until the connection attempt fails. Some types of connection
failure are reported very quickly, whereas others can only be detected by means of a timeout. In the
latter case connect may block for several minutes.
If the remote address was constructed using getaddrinfo then the memory occupied by the address
list can now be released:
freeaddrinfo(res);
(If the address list has been searched or filtered then take care that it is the head of the list that is
released, not the address that you have chosen to use.)
The socket descriptor is now ready for use. Here is an example of how it might be utilised to
implement a Daytime Protocol client:
// Daytime Protocol client: copy everything received on fd to standard output
// until the server closes the connection, then close the socket.
char buffer[256];
for (;;) {
    ssize_t count=read(fd,buffer,sizeof(buffer));
    if (count<0) {
        // EINTR means the read was interrupted before any data arrived: retry.
        if (errno!=EINTR) die("%s",strerror(errno));
    } else if (count==0) {
        // End of stream: the server has closed the connection.
        break;
    } else {
        // write may accept fewer bytes than requested or be interrupted by a
        // signal, so keep going until the whole chunk has been written
        // (the same pattern as handle_session uses on the server side).
        ssize_t done=0;
        while (done<count) {
            ssize_t written=write(STDOUT_FILENO,buffer+done,(size_t)(count-done));
            if (written<0) {
                if (errno==EINTR) continue;
                die("%s",strerror(errno));
            } else {
                done+=written;
            }
        }
    }
}
close(fd);
See also
Listen for and accept TCP connections in C
Send a UDP datagram in C
Send an arbitrary IPv4 datagram using a raw socket in C
Further Reading
Listen for and accept TCP connections in C, microHOWTO
1 Objective
2 Scenario
3 Method
3.1 Overview
4 Variations
5 See also
6 Further Reading
Tested on
Debian (Lenny)
Ubuntu (Trusty)
Objective
To listen for and accept inbound TCP connections in C
Scenario
Suppose that you wish to write a daemon that implements the TCP-based variant of the Daytime
Protocol, as defined by RFC 867
This is a very simple protocol whereby the server sends a human-readable copy of the current date
and time then closes the connection. Any data that the client might send is ignored.
Method
Overview
The method described here has six steps:
1. Construct the local socket address.
2. Create the server socket.
3. Set the SO_REUSEADDR socket option.
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
// NOTE(review): hints is declared and partly initialised on lines that are
// missing from this excerpt - confirm against the complete example.
hints.ai_flags=AI_PASSIVE|AI_ADDRCONFIG;
// Resolve hostname and portname; the list of results is returned through res.
struct addrinfo* res=0;
int err=getaddrinfo(hostname,portname,&hints,&res);
if (err!=0) {
// A non-zero return value is an error code, reported here numerically.
die("failed to resolve local socket address (err=%d)",err);
}
The hints argument contains additional information to help guide the conversion. In this example:
The address family has been left unspecified so that both IPv4 and IPv6
addresses can be returned. In principle you could receive results for other
address families too: you can either treat this as a feature, or filter out
any unwanted results after the call to getaddrinfo.
The socket type has been constrained to SOCK_STREAM. This allows TCP
but excludes UDP.
AF_INET6
socket
function.
2. the socket type (SOCK_STREAM in this case, meaning that the socket should
provide reliable transport of an unstructured byte stream), and
// Enable the SO_REUSEADDR option on the server socket.
int reuseaddr=1;
if (setsockopt(server_fd,SOL_SOCKET,SO_REUSEADDR,&reuseaddr,sizeof(reuseaddr))==-1) {
// setsockopt failed: pass the errno description to die.
die("%s",strerror(errno));
}
See Listen on a TCP port with connections in the TIME-WAIT state for a detailed discussion of
this issue.
The first argument is the socket descriptor. The second and third arguments are the local address
and its length.
If the local address was constructed using getaddrinfo then the memory occupied by the address
list can now be released:
freeaddrinfo(res);
(If the address list has been searched or filtered then take care that it is the head of the list that is
released, not the address that you have chosen to use.)
The first argument is the socket descriptor. The second argument is the backlog of outstanding
connections that the operating system should queue while they are waiting to be accepted by the
server process. It is only a hint: most implementations take some account of the value requested,
but you should not make any assumptions. A value of SOMAXCONN indicates that the maximum
permissible queue length should be selected.
The optimum value for the backlog depends on the nature of the load:
If the value is too low then the server will be poor at handling short-term
bursts of activity. Connections may be rejected even if the average load is
well below what the server can handle.
If the value is too high then the server will perform less well when it is
genuinely overloaded. Under those circumstances, lengthening the queue
merely increases latency without improving capacity.
A backlog of 5 is a popular choice due to its use in many tutorials. For services that receive
connections at a very slow rate this is probably adequate, but it is too low for services that handle
many short-lived connections (such as web servers). In that case the author's advice would be to
make the value configurable, with a default of SOMAXCONN.
if (pid==-1) {
die("failed to create child process (errno=%d)",errno);
} else if (pid==0) {
close(server_fd);
handle_session(session_fd);
close(session_fd);
_exit(0);
} else {
close(session_fd);
}
}
The parent process should close the descriptor for each connected socket once the corresponding
child process has been spawned. There are two reasons for doing this: to prevent the descriptors
from accumulating, and to prevent the connection from being held open by the parent after it has
been closed by the child. Similarly, the child process should close any file or socket descriptors
inherited from the parent that it does not need access to. This will certainly include the descriptor
for the server socket, but you should consider whether there are any others.
Functionality that is specific to the network service is represented here by the
function handle_session. As a simple example, here is an implementation of the Daytime Protocol:
// Daytime Protocol session handler.
//
// Formats the current local date and time as one line of text terminated by
// CR LF and writes all of it to session_fd. If the formatted time does not
// fit in the buffer, an error line is sent instead. The descriptor is not
// closed here. A write failure other than EINTR is reported through die.
void handle_session(int session_fd) {
    time_t now=time(0);
    char buffer[80];
    size_t length=strftime(buffer,sizeof(buffer),"%a %b %d %T %Y\r\n",localtime(&now));
    if (length==0) {
        // strftime returns 0 when the result does not fit in the buffer.
        // Send an error line instead, and record its length: leaving length
        // at zero would cause the loop below to send nothing at all.
        int formatted=snprintf(buffer,sizeof(buffer),"Error: buffer overflow\r\n");
        if (formatted<0) {
            length=0;
        } else if ((size_t)formatted>=sizeof(buffer)) {
            // snprintf reports the untruncated length; only what fits was stored.
            length=sizeof(buffer)-1;
        } else {
            length=(size_t)formatted;
        }
    }

    // write may accept fewer bytes than requested, so loop until every byte
    // has been sent.
    size_t index=0;
    while (index<length) {
        ssize_t count=write(session_fd,buffer+index,length-index);
        if (count<0) {
            // Interrupted before anything was written: retry.
            if (errno==EINTR) continue;
            die("failed to write to socket (errno=%d)",errno);
        } else {
            index+=count;
        }
    }
}
Variations
Determining the remote address
It is often desirable and sometimes necessary to determine the remote address from which an
inbound connection originated. A common reason for wanting to do this is to keep a log of all
connections. Other possible motivations include access control, or establishing an outbound
connection back to the client.
The address can be obtained at the time when the connection is accepted by supplying a buffer to
place it in. Alternatively, it can be obtained at any time while the connection is open by
calling getpeername.
The supplied buffer must be large enough and sufficiently well-aligned to accept any socket address
that might be returned. If the address family has not been hard-coded then you can use the
type struct sockaddr_storage, which is designed to hold addresses of any type:
// sockaddr_storage is designed to hold a socket address of any type.
struct sockaddr_storage sa;
// Tell accept how much space is available for the address.
socklen_t sa_len=sizeof(sa);
// Accept a connection, capturing the remote address in sa and its length in sa_len.
int session_fd=accept(server_fd,(struct sockaddr*)&sa,&sa_len);
Alternatively, if the local address was constructed using getaddrinfo then the required size in bytes
can be found in the ai_addrlen member of the relevant addrinfo structure.
If there is a need to convert the address to human-readable form then this is best done using
the getnameinfo function, especially if it is not known whether the address family is IPv4 or IPv6:
// Convert the address in sa to a numeric string (NI_NUMERICHOST); no service
// name is requested (the 0,0 arguments).
char buffer[INET6_ADDRSTRLEN];
int err=getnameinfo((struct sockaddr*)&sa,sa_len,buffer,sizeof(buffer),0,0,NI_NUMERICHOST);
if (err!=0) {
// Conversion failed: substitute a placeholder string.
snprintf(buffer,sizeof(buffer),"invalid address");
}
A useful refinement is to convert IPv4-mapped addresses into plain IPv4 addresses prior to
calling getnameinfo:
// If sa holds an IPv4-mapped IPv6 address, replace it in place with the
// equivalent plain IPv4 address and adjust sa_len to match. Any other
// address is left untouched.
if (sa.ss_family==AF_INET6) {
    const struct sockaddr_in6* mapped=(const struct sockaddr_in6*)&sa;
    if (IN6_IS_ADDR_V4MAPPED(&mapped->sin6_addr)) {
        struct sockaddr_in plain;
        memset(&plain,0,sizeof(plain));
        plain.sin_family=AF_INET;
        plain.sin_port=mapped->sin6_port;
        // The IPv4 address occupies the last 4 of the 16 IPv6 address bytes.
        memcpy(&plain.sin_addr.s_addr,mapped->sin6_addr.s6_addr+12,4);
        sa_len=sizeof(plain);
        // Overwrite sa last: mapped points into sa, so everything needed
        // from it must already have been copied out.
        memcpy(&sa,&plain,sizeof(plain));
    }
}
For example, if an IPv4 connection from 192.168.0.1 were received using an IPv6 socket then the
code fragment above would cause the address to be presented as 192.168.0.1 instead of the less
readable ::ffff:192.168.0.1.
See also
Listen on a TCP port with connections in the TIME-WAIT state
Establish a TCP connection in C
Listen for and receive UDP datagrams in C
Further Reading
Listen on a TCP port with connections in the TIME-WAIT state
Convert an IP address to a human-readable string in C
1 Objective
2 Background
3 Scenario
4 Method
5 Notes
6 Methods to avoid
Tested on
Debian (Lenny, Precise)
Objective
To begin listening on a TCP port whilst there are one or more connections to that port in the TIME-WAIT state, without waiting for the TIME-WAIT state to expire.
Background
When a TCP connection is closed then the socket from which the closure was initiated is not
destroyed immediately. Instead it is placed in the TIME-WAIT state, where it is required to remain
for at least twice the maximum segment lifetime (MSL) to allow any stray network packets to
dissipate. During this period it is not permissible for another TCP connection to be established
between the same pair of IP addresses and port numbers.
By itself this would be no great burden, but most implementations go further and (by default) do
not allow a local address to be bound to a socket if there are any existing sockets using the same IP
address and port number (including sockets in the TIME-WAIT state).
The practical effect of this behaviour is that when a network service terminates leaving connections
in the TIME-WAIT state, it may not be possible to restart that service until the TIME-WAIT states
have expired. The error reported when this happens is EADDRINUSE, which glibc renders as Address
already in use.
Note that TIME-WAIT is not the only issue that could result in an EADDRINUSE error. For example,
there could be orphaned child processes that were spawned by the network service but are still
handling connections. Alternatively there could be another process listening to the port, perhaps
because the previously running instance of the network service failed to die. You can check for
these conditions by running the netstat command, without the -l option for connected sockets:
netstat -tn
Scenario
Suppose you are writing a daemon that provides a TCP-based network service. Currently the
following sequence of operations is used to open a server socket and listen on the required port:
// Create an IPv4 stream socket; protocol 0 requests the default for that combination.
int fd=socket(AF_INET,SOCK_STREAM,0);
if (fd==-1) {
die("%s",strerror(errno));
}
// Bind the socket to the local address held in addr (declared outside this excerpt).
if (bind(fd,(struct sockaddr*)&addr,sizeof(addr))==-1) {
die("%s",strerror(errno));
}
// Start listening, requesting the maximum permissible backlog.
if (listen(fd,SOMAXCONN)==-1) {
die("%s",strerror(errno));
}
When the network service is restarted it sometimes fails with the error Address already in use.
You wish to prevent this from happening.
Method
The error can be avoided by setting the SO_REUSEADDR socket option after the socket has been
created but before calling bind:
// Create an IPv4 stream socket; protocol 0 requests the default for that combination.
int fd=socket(AF_INET,SOCK_STREAM,0);
if (fd==-1) {
die("%s",strerror(errno));
}
// Set SO_REUSEADDR after the socket has been created but before bind is called.
int reuseaddr=1;
if (setsockopt(fd,SOL_SOCKET,SO_REUSEADDR,&reuseaddr,sizeof(reuseaddr))==-1) {
die("%s",strerror(errno));
}
// Bind the socket to the local address held in addr (declared outside this excerpt).
if (bind(fd,(struct sockaddr*)&addr,sizeof(addr))==-1) {
die("%s",strerror(errno));
}
// Start listening, requesting the maximum permissible backlog.
if (listen(fd,SOMAXCONN)==-1) {
die("%s",strerror(errno));
}
SO_REUSEADDR allows a local address to be bound to a socket even if that address is already being
used by a connection. This is helpful not only for dealing with connections in the TIME-WAIT
state, but also any ESTABLISHED connections that are being handled by orphaned child processes.
It is considered safe for a TCP server socket to reuse a local address, because such sockets are used
only to listen for connections and do not themselves act as endpoints. When new connections arrive
they will need to be checked to ensure that they do not clash with existing ones, but this is
something the network stack should be doing anyway: it makes no difference that the server
process has been restarted.
In the absence of any good reason for leaving SO_REUSEADDR unset, it is considered good practice to
set it as a matter of routine when creating TCP server sockets.
Notes
Depending on the implementation, it may be necessary for SO_REUSEADDR to be set both before and
after the service is restarted.
SO_REUSEADDR does not allow two TCP sockets to listen to the same IP address and port number at
the same time.
Methods to avoid
Using SO_LINGER
It is possible to prevent the TIME-WAIT state from being entered in the first place by setting
the SO_LINGER option with a timeout of zero. This changes the behaviour of the close function:
instead of performing a graceful shutdown, it aborts the connection by sending an immediate RST.
Any unsent data is discarded and the socket immediately reverts to the CLOSED state.
Whilst this would meet the objective as stated, it is not a desirable solution because it circumvents
the protection against stray network packets provided by the TIME-WAIT state.
Since SO_REUSEADDR achieves the desired effect more safely, there is no justification for
using SO_LINGER to avoid EADDRINUSE errors.
1 Objective
2 Scenario
3 Method
4 Variations
Tested on
Debian (Lenny)
Ubuntu (Precise, Trusty)
Objective
To convert an IPv4 or IPv6 address to a human-readable string (for
example 192.168.0.1 or 2001:db8::1)
Scenario
Suppose you have used the getpeername function to obtain the remote address to which a particular
TCP socket is connected:
// Fetch the remote address to which sock_fd is connected. The address is
// written to addr and its length is recorded in addr_len.
struct sockaddr_storage addr;
socklen_t addr_len=sizeof(addr);
int err=getpeername(sock_fd,(struct sockaddr*)&addr,&addr_len);
if (err!=0) {
// getpeername failed: report the errno value numerically via die.
die("failed to fetch remote address (errno=%d)",errno);
}
The remote address has been written to a buffer called addr. This buffer is of type struct
sockaddr_storage, but the address stored within it will be of type struct
sockaddr_in or sockaddr_in6. The length of the address has been recorded in the variable addr_len.
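Note that addr_len is initialised to sizeof(struct sockaddr_storage), but it will normally be smaller
than that once the call to getpeername has returned, because it is updated to the actual length of
the address stored in addr.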
You wish to convert the IP address contained within addr to a human-readable string.
Method
One way to perform the required conversion is to call the getnameinfo function. By default this
attempts to convert the address into a domain name, however it can be instructed to produce a
numeric address instead by setting the NI_NUMERICHOST flag:
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
// ...
char buffer[INET6_ADDRSTRLEN];
int err=getnameinfo((struct sockaddr*)&addr,addr_len,buffer,sizeof(buffer),
0,0,NI_NUMERICHOST);
if (err!=0) {
die("failed to convert address to string (code=%d)",err);
}
printf("Remote address: %s\n",buffer);
The string buffer needs to be at least INET_ADDRSTRLEN bytes long for IPv4 and INET6_ADDRSTRLEN for
IPv6. Since these constants are fixed (by POSIX) at 16 and 46 bytes
respectively, INET6_ADDRSTRLEN can be presumed to suffice for either address family.
Variations
Converting IPv4-mapped IPv6 addresses to plain IPv4
If an IPv4 connection is made to an IPv6 socket then the local and remote network addresses will
be represented as IPv4-mapped addresses. For example, the IPv4 address 192.168.0.1 would be
represented by the IPv6 address ::ffff:192.168.0.1.
This format is readable, but it is probably not the best choice for presentation to the user. Since the
connection was made using IPv4, the user could reasonably expect to see an IPv4 address. This can
be achieved by converting the address from IPv6 to IPv4 before calling getnameinfo:
if (addr.ss_family==AF_INET6) {
struct sockaddr_in6* addr6=(struct sockaddr_in6*)&addr;
if (IN6_IS_ADDR_V4MAPPED(&addr6->sin6_addr)) {
struct sockaddr_in addr4;
memset(&addr4,0,sizeof(addr4));
addr4.sin_family=AF_INET;
addr4.sin_port=addr6->sin6_port;
memcpy(&addr4.sin_addr.s_addr,addr6->sin6_addr.s6_addr+12,sizeof(addr4.sin_addr.s_addr));
memcpy(&addr,&addr4,sizeof(addr4));
addr_len=sizeof(addr4);
}
}
The conversion is performed only if the address family is IPv6, and then only if the address is IPv4-mapped. The address buffer must be writable, and of the appropriate size and alignment to hold an
IPv4 or IPv6 socket address. (That is the case here because the buffer is of type struct
sockaddr_storage).
Alternatives
Using inet_ntop
An alternative method is to use the function inet_ntop. This is somewhat easier to use
than getnameinfo if the IP address is not already embedded within a socket address, for example:
#include <arpa/inet.h>
// ...
char buffer[INET_ADDRSTRLEN];
const char* result=inet_ntop(AF_INET,&ipv4addr,buffer,sizeof(buffer));
if (result==0) {
die("failed to convert address to string (errno=%d)",errno);
}
IPv6 addresses can be handled by specifying AF_INET6 as the first argument, but
(unlike getnameinfo) the result will not include the scope of a link-local or site-local address.
For both IPv4 and IPv6 the address passed in must be in network byte order (most significant byte
first).
Using inet_ntoa
Another alternative is to use the function inet_ntoa. As with inet_ntop, the given IP address need
not be embedded within a socket address:
#include <arpa/inet.h>
// ...
const char* result=inet_ntoa(ipv4addr);
Notable disadvantages of inet_ntoa are that it is not thread safe and provides no support for IPv6.
However it does pre-date both getnameinfo and inet_ntop, so is more likely to be available on older
systems.
# ifconfig eth0
collisions:0 txqueuelen:1000
3. Disable an Interface
# ifconfig eth0 down
4. Enable an Interface
# ifconfig eth0 up
Or # ifup eth0
Assign ip-address, netmask and broadcast at the same time to interface eth0.
6. Change MTU
This will change the Maximum transmission unit (MTU) to XX. MTU is the maximum number of
octets the interface is able to handle in one transaction. For Ethernet the Maximum transmission unit
by default is 1500.
7. Promiscuous mode
By default when a network card receives a packet, it checks whether the packet belongs to itself. If not,
the interface card normally drops the packet. But in promiscuous mode, the card doesn't drop the
packet. Instead, it will accept all the packets which flow through the network card.
Superuser privilege is required to set an interface in promiscuous mode. Most network monitor tools
use the promiscuous mode to capture the packets and to analyze the network traffic.
[root@tecmint ~]# ifconfig eth0:0 172.16.25.127
Next, verify the newly created alias network interface address, by using ifconfig eth0:0
command.
[root@tecmint ~]# ifconfig eth0:0
eth0:0 Link encap:Ethernet HWaddr 00:01:6C:99:14:68
inet addr:172.16.25.123 Bcast:172.16.25.63 Mask:255.255.255.240
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
Interrupt:17
[root@tecmint ~]# ifconfig eth0:0 down
[root@tecmint ~]# ifconfig eth0 hw ether AA:BB:CC:DD:EE:FF
These are the most useful commands for configuring network interfaces in Linux, for
more information and usage of ifconfig command use the manpages like man ifconfig
at the terminal. Check out some other networking utilities below.
You can do this easily by one command. It works on both RedHat and Debian based distributions. Below is an example:
root@db1:~# ifconfig eth1 promisc
DEVICE=ethX
ONBOOT=yes
TYPE=Ethernet
PROMISC=yes
USERCTL=no
Don't forget to replace ethX with the right device you are using.
TCPDUMP INFO
When it comes to tcpdump most admins fall into two categories; they either
know tcpdump and all of its flags like the back of their hand, or they kind of know it but
need to use a reference for anything outside of the basic usage. The reason for this is
because tcpdump is a pretty advanced command and it is pretty easy to get into the
depths of how networking works when using it.
For today's article I wanted to create a quick but practical reference for tcpdump. I will
cover the basics as well as some of the more advanced usage. I am sure I will most likely
leave out some cool commands so if you want to add anything please feel free to drop it
into the comments section.
Before we get too far into the weeds, it is probably best to cover what tcpdump is used
for. The command tcpdump is used to create "dumps" or "traces" of network traffic. It
allows you to look at what is happening on the network and really can be useful for
troubleshooting many types of issues including issues that aren't due to network
communications. Outside of network issues I use tcpdump to troubleshoot application
issues all the time; if you ever have two applications that don't seem to be working well
together, tcpdump is a great way to see what is happening. This is especially true if the
traffic is not encrypted as tcpdump can be used to capture and read packet data as well.
The Basics
The first thing to cover with tcpdump is what flags to use. In this section I am going to
cover the most basic flags that can be used in most situations.
By default tcpdump will try to lookup and translate hostnames and ports.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196
You can turn this off by using the -n flag. Personally, I always use this flag as the
hostname and port translation usually annoys me because I tend to work from IP
addresses rather than hostnames. However, knowing that you can
have tcpdump translate or not translate these is useful, as there are times where
knowing what server the source traffic is coming from is important.
# tcpdump -n
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:23:47.934665 IP 10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], seq 2546457621:2546457817, ack
1824684201, win 355, options [nop,nop,TS val 621010158 ecr 621010055], length 196
Adding verbosity
# tcpdump -v
By adding a simple -v the output will start including a bit more such as the ttl, total length
and options in the IP packets.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196
tcpdump has three verbosity levels, you can add more verbosity by adding additional v's
to the command line flags. In general whenever I am using tcpdump I tend to use the
highest verbosity, as I like having everything visible just in case I need it.
# tcpdump -vvv -c 1
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:36:13.873456 IP (tos 0x10, ttl 64, id 121, offset 0, flags [DF], proto TCP (6), length 184)
blog.ssh > 10.0.3.1.32855: Flags [P.], cksum 0x1ba1 (incorrect -> 0x0dfd), seq
2546458841:2546458973, ack 1824684869, win 355, options [nop,nop,TS val 621196643 ecr 621196379],
length 132
Specifying an Interface
# tcpdump -i eth0
By default when you run tcpdump without specifying an interface it will choose the
lowest numbered interface, usually this is eth0 however that is not guaranteed for all
systems.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196
You can specify the interface by using the -i flag followed by the interface name. On most
linux systems a special interface name of any can be used to tell tcpdump to listen on
all interfaces, I find this extremely useful when troubleshooting servers with multiple
interfaces. This is especially true when there are routing issues involved.
# tcpdump -i any
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
16:45:59.312046 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2547763641:2547763837, ack 1824693949,
win 355, options [nop,nop,TS val 621343002 ecr 621342962], length 196
Writing to a file
# tcpdump -w /path/to/file
When you just run tcpdump by itself it will output to your screen.
# tcpdump
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
16:15:05.051896 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2546456553:2546456749, ack 1824683693,
win 355, options [nop,nop,TS val 620879437 ecr 620879348], length 196
There are many times where you may want to save the tcpdump data to a file, the easiest
way to do this is to use the -w flag. This is useful for situations where you may need to
save the network dump to review later. One benefit to saving the data to a file is that you
can read the dump file multiple times and apply other flags or filters (which we will cover
below) to that snapshot of network traffic.
# tcpdump -w /var/tmp/tcpdata.pcap
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
1 packet captured
2 packets received by filter
0 packets dropped by kernel
By default the data is buffered and will not usually be written to the file until
you CTRL+C out of the running tcpdump command.
Once you save the output to a file you will inherently need to read that file. To do this
you can simply use the -r flag followed by the path to the file.
# tcpdump -r /var/tmp/tcpdata.pcap
reading from file /var/tmp/tcpdata.pcap, link-type EN10MB (Ethernet)
16:56:01.610473 IP blog.ssh > 10.0.3.1.32855: Flags [P.], seq 2547766673:2547766805, ack 1824696181,
win 355, options [nop,nop,TS val 621493577 ecr 621493478], length 132
As a quick note, if you are more familiar with tools such as wireshark you can read files
saved by tcpdump with most network troubleshooting tools like wireshark.
By default most newer implementations of tcpdump will capture 65535 bytes, however
in some situations you may not want to capture the default packet length. You can use -s to specify the "snaplen" or "snapshot length" that you want tcpdump to capture.
When you run tcpdump by itself it will keep running until you hit CTRL+C to quit.
# tcpdump host google.com
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
^C
0 packets captured
4 packets received by filter
0 packets dropped by kernel
You can tell tcpdump to stop capturing after a certain number of packets by using the -c flag followed by the number of packets to capture. This is pretty useful for situations
where you may not want tcpdump to spew output to your screen so fast you can't read it,
however generally this is more useful when you are using filters to grab specific traffic.
All of the basic flags that were covered above can also be combined to allow you to
specify exactly what you want tcpdump to provide.
Filters
Now that we have covered some of the basic flags we should cover
filtering. tcpdump has the ability to filter the capture or output based on a variety of
expressions, in this article I am only going to cover a few quick examples to give you an
idea of the syntax. For a full list you can checkout the pcap-filter section of
the tcpdump manpage.
The above command will run a tcpdump and send the output to the screen like we saw
with the flags before, however it will only do so if the source or destination IP address
is 10.0.3.1. Essentially by adding host 10.0.3.1 we are asking tcpdump to filter out
anything that is not to or from 10.0.3.1.
# tcpdump -nvvv -i any -c 3 host 10.0.3.1
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
17:54:15.067496 IP (tos 0x10, ttl 64, id 5502, offset 0, flags [DF], proto TCP (6), length 184)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1ba1 (incorrect -> 0x9f75), seq
2547785621:2547785753, ack 1824705637, win 355, options [nop,nop,TS val 622366941 ecr 622366923],
length 132
17:54:15.067613 IP (tos 0x10, ttl 64, id 52315, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x7c34), seq 1, ack 132, win 540,
options [nop,nop,TS val 622366941 ecr 622366941], length 0
17:54:15.075230 IP (tos 0x10, ttl 64, id 5503, offset 0, flags [DF], proto TCP (6), length 648)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1d71 (incorrect -> 0x3443), seq 132:728, ack 1,
win 355, options [nop,nop,TS val 622366943 ecr 622366941], length 596
Where the previous example showed traffic to and from 10.0.3.1 the above command
will only show traffic where the source of the packet is 10.0.3.1. This is accomplished
by adding src in front of the host filter. This is an additional filter that tells tcpdump to
look for a specific "source". This can be reversed by using the dst filter, which specifies
the "destination".
# tcpdump -nvvv -i any -c 3 src host 10.0.3.1
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
17:57:12.194902 IP (tos 0x10, ttl 64, id 52357, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x1707), seq 1824706545, ack
2547787717, win 540, options [nop,nop,TS val 622411223 ecr 622411223], length 0
17:57:12.196288 IP (tos 0x10, ttl 64, id 52358, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x15c5), seq 0, ack 325, win 538,
options [nop,nop,TS val 622411223 ecr 622411223], length 0
17:57:12.197677 IP (tos 0x10, ttl 64, id 52359, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.32855 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x1491), seq 0, ack 633, win
536, options [nop,nop,TS val 622411224 ecr 622411224], length 0
# tcpdump -nvvv -i any -c 3 dst host 10.0.3.1
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
17:59:37.266838 IP (tos 0x10, ttl 64, id 5552, offset 0, flags [DF], proto TCP (6), length 184)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1ba1 (incorrect -> 0x586d), seq
2547789725:2547789857, ack 1824707577, win 355, options [nop,nop,TS val 622447491 ecr 622447471],
length 132
17:59:37.267850 IP (tos 0x10, ttl 64, id 5553, offset 0, flags [DF], proto TCP (6), length 392)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1c71 (incorrect -> 0x462e), seq 132:472, ack 1,
win 355, options [nop,nop,TS val 622447491 ecr 622447491], length 340
17:59:37.268606 IP (tos 0x10, ttl 64, id 5554, offset 0, flags [DF], proto TCP (6), length 360)
10.0.3.246.22 > 10.0.3.1.32855: Flags [P.], cksum 0x1c51 (incorrect -> 0xf469), seq 472:780, ack 1, win
355, options [nop,nop,TS val 622447491 ecr 622447491], length 308
You can add some rather complicated filtering statements with tcpdump when you start
using operators like and. You can think of this as something similar to if statements. In
this example we are using the and operator to tell tcpdump to only output packets that
have both ports 22 and 60738. This allows us to narrow down the packets to a specific
session, this can be extremely useful when troubleshooting network issues.
# tcpdump -nvvv -i any -c 3 port 22 and port 60738
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:05:54.069403 IP (tos 0x10, ttl 64, id 64401, offset 0, flags [DF], proto TCP (6), length 104)
10.0.3.1.60738 > 10.0.3.246.22: Flags [P.], cksum 0x1b51 (incorrect -> 0x5b3c), seq
917414532:917414584, ack 1550997318, win 353, options [nop,nop,TS val 622541691 ecr 622538903],
length 52
18:05:54.072963 IP (tos 0x10, ttl 64, id 13601, offset 0, flags [DF], proto TCP (6), length 184)
10.0.3.246.22 > 10.0.3.1.60738: Flags [P.], cksum 0x1ba1 (incorrect -> 0xb0b1), seq 1:133, ack 52, win
355, options [nop,nop,TS val 622541692 ecr 622541691], length 132
18:05:54.073080 IP (tos 0x10, ttl 64, id 64402, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.60738 > 10.0.3.246.22: Flags [.], cksum 0x1b1d (incorrect -> 0x1e3b), seq 52, ack 133, win
353, options [nop,nop,TS val 622541692 ecr 622541692], length 0
You can express the and operator in a couple of different ways, you can use and or &&.
Personally, I tend to use them both; it is important to remember that if you are going to
use && that you should enclose the filter expression with single or double quotes. In
BASH you can use && to run one command and if successful run a second. In general it
is best to simply wrap filter expressions in quotes; this will prevent any unexpected
results as filters can have quite a few special characters.
You can also use the or or || operator to filter tcpdump results. In this example we are
using the or operator to capture traffic to and from port 80 or port 443. This example is
especially useful as webservers generally have two ports open, 80 for http traffic
and 443 for https.
# tcpdump -nvvv -i any -c 20 'port 80 or port 443'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:24:28.817940 IP (tos 0x0, ttl 64, id 39930, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.1.50524 > 10.0.3.246.443: Flags [S], cksum 0x1b25 (incorrect -> 0x8611), seq 3836995553, win
29200, options [mss 1460,sackOK,TS val 622820379 ecr 0,nop,wscale 7], length 0
18:24:28.818052 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40)
10.0.3.246.443 > 10.0.3.1.50524: Flags [R.], cksum 0x012c (correct), seq 0, ack 3836995554, win 0,
length 0
18:24:32.721330 IP (tos 0x0, ttl 64, id 48510, offset 0, flags [DF], proto TCP (6), length 475)
10.0.3.1.60374 > 10.0.3.246.80: Flags [P.], cksum 0x1cc4 (incorrect -> 0x3a4e), seq
580573019:580573442, ack 1982754038, win 237, options [nop,nop,TS val 622821354 ecr 622815632],
length 423
18:24:32.721465 IP (tos 0x0, ttl 64, id 1266, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.246.80 > 10.0.3.1.60374: Flags [.], cksum 0x1b1d (incorrect -> 0x45d7), seq 1, ack 423, win
243, options [nop,nop,TS val 622821355 ecr 622821354], length 0
18:24:32.722098 IP (tos 0x0, ttl 64, id 1267, offset 0, flags [DF], proto TCP (6), length 241)
10.0.3.246.80 > 10.0.3.1.60374: Flags [P.], cksum 0x1bda (incorrect -> 0x855c), seq 1:190, ack 423,
win 243, options [nop,nop,TS val 622821355 ecr 622821354], length 189
18:24:32.722232 IP (tos 0x0, ttl 64, id 48511, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.60374 > 10.0.3.246.80: Flags [.], cksum 0x1b1d (incorrect -> 0x4517), seq 423, ack 190, win
245, options [nop,nop,TS val 622821355 ecr 622821355], length 0
Searching for traffic on two specific ports and from a specific host
# tcpdump -nvvv -i any -c 20 '(port 80 or port 443) and host 10.0.3.169'
While the previous example is great for looking at issues for a multiport protocol; what if
this is a very high traffic webserver? The output from tcpdump may get a bit confusing.
We can narrow down the results even further by adding a host filter. To do this while
maintaining our or expression we can simply wrap the or statement in parentheses.
# tcpdump -nvvv -i any -c 20 '(port 80 or port 443) and host 10.0.3.169'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:38:05.551194 IP (tos 0x0, ttl 64, id 63169, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.169.33786 > 10.0.3.246.443: Flags [S], cksum 0x1bcd (incorrect -> 0x0d96), seq 4173164403,
win 29200, options [mss 1460,sackOK,TS val 623024562 ecr 0,nop,wscale 7], length 0
18:38:05.551310 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40)
10.0.3.246.443 > 10.0.3.169.33786: Flags [R.], cksum 0xa64a (correct), seq 0, ack 4173164404, win 0,
length 0
18:38:05.717130 IP (tos 0x0, ttl 64, id 51574, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.169.35629 > 10.0.3.246.80: Flags [S], cksum 0x1bcd (incorrect -> 0xdf7c), seq 1068257453, win
29200, options [mss 1460,sackOK,TS val 623024603 ecr 0,nop,wscale 7], length 0
18:38:05.717255 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.80 > 10.0.3.169.35629: Flags [S.], cksum 0x1bcd (incorrect -> 0xed80), seq 2992472447,
ack 1068257454, win 28960, options [mss 1460,sackOK,TS val 623024603 ecr 623024603,nop,wscale 7],
length 0
18:38:05.717474 IP (tos 0x0, ttl 64, id 51575, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.169.35629 > 10.0.3.246.80: Flags [.], cksum 0x1bc5 (incorrect -> 0x8c87), seq 1, ack 1, win 229,
options [nop,nop,TS val 623024604 ecr 623024603], length 0
You can use the parenthesis multiple times in a single filter, for example the below
command will filter the capture to only packets that are to or from port 80 or
port 443 and from hosts 10.0.3.169 and 10.0.3.1 if they are destined for 10.0.3.246.
# tcpdump -nvvv -i any -c 20 '((port 80 or port 443) and (host 10.0.3.169 or host 10.0.3.1)) and
dst host 10.0.3.246'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
18:53:30.349306 IP (tos 0x0, ttl 64, id 52641, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.1.35407 > 10.0.3.246.80: Flags [S], cksum 0x1b25 (incorrect -> 0x4890), seq 3026316656, win
29200, options [mss 1460,sackOK,TS val 623255761 ecr 0,nop,wscale 7], length 0
18:53:30.349558 IP (tos 0x0, ttl 64, id 52642, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.35407 > 10.0.3.246.80: Flags [.], cksum 0x1b1d (incorrect -> 0x3454), seq 3026316657, ack
3657995297, win 229, options [nop,nop,TS val 623255762 ecr 623255762], length 0
18:53:30.354056 IP (tos 0x0, ttl 64, id 52643, offset 0, flags [DF], proto TCP (6), length 475)
10.0.3.1.35407 > 10.0.3.246.80: Flags [P.], cksum 0x1cc4 (incorrect -> 0x10c2), seq 0:423, ack 1, win
229, options [nop,nop,TS val 623255763 ecr 623255762], length 423
18:53:30.354682 IP (tos 0x0, ttl 64, id 52644, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.1.35407 > 10.0.3.246.80: Flags [.], cksum 0x1b1d (incorrect -> 0x31e6), seq 423, ack 190, win
237, options [nop,nop,TS val 623255763 ecr 623255763], length 0
Given the above output we can see that the source ip is 10.0.3.246 the source port
is 56894 and the destination ip is 192.168.0.92 with a destination port of 22. This is
pretty easy to identify once you understand the format of tcpdump. If you haven't
guessed the format yet you can break it down as follows: src-ip.src-port > dest-ip.dest-port: Flags [S]. The source is in front of the > and the destination is behind. You can think
of the > as an arrow pointing to the destination.
From the sample above we can tell that the packet is a single SYN packet. We can identify
this by the Flags [S] section of the tcpdump output, different types of packets have
different types of flags. Without going too deep into what types of packets exist within
TCP you can use the below as a cheat sheet for identifying packet types.
[S] - SYN (Start Connection)
[.] - No Flag Set
[P] - PSH (Push Data)
[F] - FIN (Finish Connection)
[R] - RST (Reset Connection)
Depending on the version and output of tcpdump you may also see flags such as [S.] this
is used to indicate a SYN-ACK packet.
An unhealthy example
15:15:43.323412 IP (tos 0x0, ttl 64, id 51051, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x0388), seq 682725222,
win 29200, options [mss 1460,sackOK,TS val 619989005 ecr 0,nop,wscale 7], length 0
15:15:44.321444 IP (tos 0x0, ttl 64, id 51052, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x028e), seq 682725222,
win 29200, options [mss 1460,sackOK,TS val 619989255 ecr 0,nop,wscale 7], length 0
15:15:46.321610 IP (tos 0x0, ttl 64, id 51053, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.56894 > 192.168.0.92.22: Flags [S], cksum 0xcf28 (incorrect -> 0x009a), seq 682725222,
win 29200, options [mss 1460,sackOK,TS val 619989755 ecr 0,nop,wscale 7], length 0
A healthy example
15:18:25.716453 IP (tos 0x10, ttl 64, id 53344, offset 0, flags [DF], proto TCP (6), length 60)
10.0.3.246.34908 > 192.168.0.110.22: Flags [S], cksum 0xcf3a (incorrect -> 0xc838), seq 1943877315,
win 29200, options [mss 1460,sackOK,TS val 620029603 ecr 0,nop,wscale 7], length 0
15:18:25.716777 IP (tos 0x0, ttl 63, id 0, offset 0, flags [DF], proto TCP (6), length 60)
192.168.0.110.22 > 10.0.3.246.34908: Flags [S.], cksum 0x594a (correct), seq 4001145915, ack
1943877316, win 5792, options [mss 1460,sackOK,TS val 18495104 ecr 620029603,nop,wscale 2], length
0
15:18:25.716899 IP (tos 0x10, ttl 64, id 53345, offset 0, flags [DF], proto TCP (6), length 52)
10.0.3.246.34908 > 192.168.0.110.22: Flags [.], cksum 0xcf32 (incorrect -> 0x9dcc), ack 1, win 229,
options [nop,nop,TS val 620029603 ecr 18495104], length 0
A healthy example would look like the above; in it we can see a standard TCP 3-way handshake. The first packet above is a SYN packet from host 10.0.3.246 to
host 192.168.0.110, the second packet is a SYN-ACK from
host 192.168.0.110 acknowledging the SYN. The final packet is an ACK, or rather a SYN-ACK-ACK, from host 10.0.3.246 acknowledging that it has received the SYN-ACK. From
this point on there is an established TCP/IP connection.
Packet Inspection
Printing packet data in Hex and ASCII
# tcpdump -nvvv -i any -c 1 -XX 'port 80 and host 10.0.3.1'
0x0010: 4500 01e3 d429 4000 4006 49f5 0a00 0301 E....)@.@.I.....
0x0020: 0a00 03f6 b2a4 0050 e9a8 e3e1 39ce d0a4 .......P....9...
0x0030: 8018 00f5 1ccc 0000 0101 080a 2533 58f3 ............%3X.
0x0040: 2533 4656 4745 5420 2f73 6f6d 6570 6167 %3FVGET./somepag
0x0050: 6520 4854 5450 2f31 2e31 0d0a 486f 7374 e.HTTP/1.1..Host
0x0060: 3a20 3130 2e30 2e33 2e32 3436 0d0a 436f :.10.0.3.246..Co
0x0070: 6e6e 6563 7469 6f6e 3a20 6b65 6570 2d61 nnection:.keep-a
0x0080: 6c69 7665 0d0a 4361 6368 652d 436f 6e74 live..Cache-Cont
0x0090: 726f 6c3a 206d 6178 2d61 6765 3d30 0d0a rol:.max-age=0..
0x00a0: 4163 6365 7074 3a20 7465 7874 2f68 746d Accept:.text/htm
0x00b0: 6c2c 6170 706c 6963 6174 696f 6e2f 7868 l,application/xh
0x00c0: 746d 6c2b 786d 6c2c 6170 706c 6963 6174 tml+xml,applicat
0x00d0: 696f 6e2f 786d 6c3b 713d 302e 392c 696d ion/xml;q=0.9,im
0x00e0: 6167 652f 7765 6270 2c2a 2f2a 3b71 3d30 age/webp,*/*;q=0
0x00f0: 2e38 0d0a 5573 6572 2d41 6765 6e74 3a20 .8..User-Agent:.
0x0100: 4d6f 7a69 6c6c 612f 352e 3020 284d 6163 Mozilla/5.0.(Mac
0x0110: 696e 746f 7368 3b20 496e 7465 6c20 4d61 intosh;.Intel.Ma
0x0120: 6320 4f53 2058 2031 305f 395f 3529 2041 c.OS.X.10_9_5).A
0x0130: 7070 6c65 5765 624b 6974 2f35 3337 2e33 ppleWebKit/537.3
0x0140: 3620 284b 4854 4d4c 2c20 6c69 6b65 2047 6.(KHTML,.like.G
0x0150: 6563 6b6f 2920 4368 726f 6d65 2f33 382e ecko).Chrome/38.
0x0160: 302e 3231 3235 2e31 3031 2053 6166 6172 0.2125.101.Safar
0x0170: 692f 3533 372e 3336 0d0a 4163 6365 7074 i/537.36..Accept
0x0180: 2d45 6e63 6f64 696e 673a 2067 7a69 702c -Encoding:.gzip,
0x0190: 6465 666c 6174 652c 7364 6368 0d0a 4163 deflate,sdch..Ac
0x01a0: 6365 7074 2d4c 616e 6775 6167 653a 2065 cept-Language:.e
0x01b0: 6e2d 5553 2c65 6e3b 713d 302e 380d 0a49 n-US,en;q=0.8..I
0x01c0: 662d 4d6f 6469 6669 6564 2d53 696e 6365 f-Modified-Since
0x01d0: 3a20 5375 6e2c 2031 3220 4f63 7420 3230 :.Sun,.12.Oct.20
0x01e0: 3134 2031 393a 3430 3a32 3020 474d 540d 14.19:40:20.GMT.
0x01f0: 0a0d 0a
...
I tend to prefer to print only the ASCII data, this helps me to quickly identify what is
being sent and what is correct or not correct about the packets data. To print packet data
in only the ascii format you can use the -A flag.
# tcpdump -nvvv -i any -c 1 -A 'port 80 and host 10.0.3.1'
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
19:59:52.011337 IP (tos 0x0, ttl 64, id 53757, offset 0, flags [DF], proto TCP (6), length 406)
10.0.3.1.46172 > 10.0.3.246.80: Flags [P.], cksum 0x1c7f (incorrect -> 0xead1), seq
1552520173:1552520527, ack 428165415, win 237, options [nop,nop,TS val 624251177 ecr 624247749],
length 354
E.....@[email protected]
...
....\.P\.....I'...........
%5Q)%5C.GET /newpage HTTP/1.1
Host: 10.0.3.246
Connection: keep-alive
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/38.0.2125.101 Safari/537.36
Accept-Encoding: gzip,deflate,sdch
Accept-Language: en-US,en;q=0.8
As you can see from the output above we have successfully captured an http GET request.
Being able to print the packet data in a human readable format is very useful when
troubleshooting application issues where the traffic is not encrypted. If you are
troubleshooting encrypted traffic then printing packet data is not very useful. However, if
you have the certificates in use you could use commands such as ssldump or
even wireshark.
Non-TCP Traffic
While the majority of this article covered TCP based traffic tcpdump can capture much
more than TCP. It can also be used to capture ICMP, UDP, and ARP packets to name a
few. Below are a few quick examples of non-TCP packets captured by tcpdump.
ICMP packets
# tcpdump -nvvv -i any -c 2 icmp
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
20:11:24.627824 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto ICMP (1), length 84)
10.0.3.169 > 10.0.3.246: ICMP echo request, id 15683, seq 1, length 64
20:11:24.627926 IP (tos 0x0, ttl 64, id 31312, offset 0, flags [none], proto ICMP (1), length 84)
10.0.3.246 > 10.0.3.169: ICMP echo reply, id 15683, seq 1, length 64
UDP packets
# tcpdump -nvvv -i any -c 2 udp
tcpdump: listening on any, link-type LINUX_SLL (Linux cooked), capture size 65535 bytes
20:12:41.726355 IP (tos 0xc0, ttl 64, id 0, offset 0, flags [DF], proto UDP (17), length 76)
10.0.3.246.123 > 198.55.111.50.123: [bad udp cksum 0x43a9 -> 0x7043!] NTPv4, length 48
Client, Leap indicator: clock unsynchronized (192), Stratum 2 (secondary reference), poll 6 (64s),
precision -22
Root Delay: 0.085678, Root dispersion: 57.141830, Reference-ID: 199.102.46.75
Reference Timestamp: 3622133515.811991035 (2014/10/12 20:11:55)
Originator Timestamp: 3622133553.828614115 (2014/10/12 20:12:33)
Receive Timestamp:
started evolving it on their own. Of course the BSD socket implementation was evolved as well at the
same time and thus systems that copied it later got features that were lacking in systems that copied it
earlier. Understanding the BSD socket implementation is the key to understanding all other socket
implementations, so you should read about it even if you don't care to ever write code for a BSD system.
There are a couple of basics you should know before we look at these two options. A TCP/UDP
connection is identified by a tuple of five values:
{<protocol>, <src addr>, <src port>, <dest addr>, <dest port>}
Any unique combination of these values identifies a connection. As a result, no two connections can have
the same five values, otherwise the system would not be able to distinguish these connections any longer.
The protocol of a socket is set when a socket is created with the socket() function. The source address
and port are set with the bind() function. The destination address and port are set with
the connect() function. Since UDP is a connectionless protocol, UDP sockets can be used without
connecting them. Yet it is allowed to connect them and in some cases very advantageous for your code
and general application design. In connectionless mode, UDP sockets that were not explicitly bound when
data is sent over them for the first time are usually automatically bound by the system, as an unbound
UDP socket cannot receive any (reply) data. Same is true for an unbound TCP socket, it is automatically
bound before it will be connected.
If you explicitly bind a socket, it is possible to bind it to port 0, which means "any port". Since a socket
cannot really be bound to all existing ports, the system will have to choose a specific port itself in that case
(usually from a predefined, OS specific range of source ports). A similar wildcard exists for the source
address, which can be "any address" (0.0.0.0 in case of IPv4 and :: in case of IPv6). Unlike in case of
ports, a socket can really be bound to "any address" which means "all source IP addresses of all local
interfaces". If the socket is connected later on, the system has to choose a specific source IP address,
since a socket cannot be connected and at the same time be bound to any local IP address. Depending
on the destination address and the content of the routing table, the system will pick an appropriate source
address and replace the "any" binding with a binding to the chosen source IP address.
By default, no two sockets can be bound to the same combination of source address and source port. As
long as the source port is different, the source address is actually irrelevant.
Binding socketA to A:X and socketB to B:Y, where A and B are addresses and X and Y are ports, is always
possible as long as X != Y holds true. However, even if X == Y, the binding is still possible as long as A !=
B holds true. E.g. socketA belongs to a FTP server program and is bound
to 192.168.0.1:21 and socketB belongs to another FTP server program and is bound to 10.0.0.1:21, both
bindings will succeed. Keep in mind, though, that a socket may be locally bound to "any address". If a
socket is bound to 0.0.0.0:21, it is bound to all existing local addresses at the same time and in that case
no other socket can be bound to port 21, regardless which specific IP address it tries to bind to,
as 0.0.0.0 conflicts with all existing local IP addresses.
Anything said so far is pretty much equal for all major operating systems. Things start to get OS specific
when address reuse comes into play. We start with BSD, since as I said above, it is the mother of all
socket implementations.
SO_REUSEADDR   socketA          socketB          Result
---------------------------------------------------------------------
ON/OFF         192.168.0.1:21   192.168.0.1:21   Error (EADDRINUSE)
ON/OFF         192.168.0.1:21   10.0.0.1:21      OK
ON/OFF         10.0.0.1:21      192.168.0.1:21   OK
OFF            0.0.0.0:21       192.168.1.0:21   Error (EADDRINUSE)
OFF            192.168.1.0:21   0.0.0.0:21       Error (EADDRINUSE)
ON             0.0.0.0:21       192.168.1.0:21   OK
ON             192.168.1.0:21   0.0.0.0:21       OK
ON/OFF         0.0.0.0:21       0.0.0.0:21       Error (EADDRINUSE)
The table above assumes that socketA has already been successfully bound to the address given
for socketA, then socketB is created, either gets SO_REUSEADDR set or not, and finally is bound to the
address given for socketB. Result is the result of the bind operation for socketB. If the first column
says ON/OFF, the value of SO_REUSEADDR is irrelevant to the result.
Okay, SO_REUSEADDR has an effect on wildcard addresses, good to know. Yet that isn't the only effect it
has. There is another well known effect which is also the reason why most people use SO_REUSEADDR in
server programs in the first place. For the other important use of this option we have to take a deeper look
on how the TCP protocol works.
A socket has a send buffer and if a call to the send() function succeeds, it does not mean that the
requested data has actually really been sent out, it only means the data has been added to the send
buffer. For UDP sockets, the data is usually sent pretty soon, if not immediately, but for TCP sockets, there
can be a relatively long delay between adding data to the send buffer and having the TCP implementation
really send that data. As a result, when you close a TCP socket, there may still be pending data in the
send buffer, which has not been sent yet but your code considers it as sent, since the send() call
succeeded. If the TCP implementation was closing the socket immediately on your request, all of this data
would be lost and your code wouldn't even know about that. TCP is said to be a reliable protocol and
losing data just like that is not very reliable. That's why a socket that still has data to send will go into a
state called TIME_WAIT when you close it. In that state it will wait until all pending data has been
successfully sent or until a timeout is hit, in which case the socket is closed forcefully.
The amount of time the kernel will wait before it closes the socket, regardless if it still has pending send
data or not, is called the Linger Time. The Linger Time is globally configurable on most systems and by
default rather long (two minutes is a common value you will find on many systems). It is also configurable
per socket using the socket option SO_LINGER which can be used to make the timeout shorter or longer,
and even to disable it completely. Disabling it completely is a very bad idea, though, since closing a TCP
socket gracefully is a slightly complex process and involves sending back and forth a couple of packets
(as well as resending those packets in case they got lost) and this whole close process is also limited by
the Linger Time. If you disable lingering, your socket may not only lose pending data, it is also always
closed forcefully instead of gracefully, which is usually not recommended. The details about how a TCP
connection is closed gracefully are beyond the scope of this answer, if you want to learn more about it, I
recommend you have a look at this page. And even if you disabled lingering with SO_LINGER, if your
process dies without explicitly closing the socket, BSD (and possibly other systems) will linger
nonetheless, ignoring what you have configured. This will happen for example if your code just
calls exit() (pretty common for tiny, simple server programs) or the process is killed by a signal (which
includes the possibility that it simply crashes because of an illegal memory access). So there is nothing
you can do to make sure a socket will never linger under all circumstances.
The question is, how does the system treat a socket in state TIME_WAIT? If SO_REUSEADDR is not set, a
socket in state TIME_WAIT is considered to still be bound to the source address and port and any attempt
to bind a new socket to the same address and port will fail until the socket has really been closed, which
may take as long as the configured Linger Time. So don't expect that you can rebind the source address
of a socket immediately after closing it. In most cases this will fail. However, if SO_REUSEADDR is set for
the socket you are trying to bind, another socket bound to the same address and port in
state TIME_WAIT is simply ignored, after all it's already "half dead", and your socket can bind to exactly the
same address without any problem. In that case it plays no role that the other socket may have exactly the
same address and port. Note that binding a socket to exactly the same address and port as a dying socket
in TIME_WAIT state can have unexpected, and usually undesired, side effects in case the other socket is
still "at work", but that is beyond the scope of this answer and fortunately those side effects are rather rare
in practice.
There is one final thing you should know about SO_REUSEADDR. Everything written above will work as long
as the socket you want to bind to has address reuse enabled. It is not necessary that the other socket, the
one which is already bound or is in a TIME_WAIT state, also had this flag set when it was bound. The code
that decides if the bind will succeed or fail only inspects the SO_REUSEADDR flag of the socket fed into
the bind() call, for all other sockets inspected, this flag is not even looked at.
SO_REUSEPORT
SO_REUSEPORT is what most people would expect SO_REUSEADDR to be. Basically, SO_REUSEPORT allows
you to bind an arbitrary number of sockets to exactly the same source address and port as long
as all prior bound sockets also had SO_REUSEPORT set before they were bound. If the first socket that is
bound to an address and port does not have SO_REUSEPORT set, no other socket can be bound to exactly
the same address and port, regardless if this other socket has SO_REUSEPORT set or not, until the first
socket releases its binding again. Unlike in case of SO_REUSEADDR the code handling SO_REUSEPORT will
not only verify that the currently bound socket has SO_REUSEPORT set but it will also verify that the socket
with a conflicting address and port had SO_REUSEPORT set when it was bound.
SO_REUSEPORT does not imply SO_REUSEADDR. This means if a socket did not have SO_REUSEPORT set
when it was bound and another socket has SO_REUSEPORT set when it is bound to exactly the same
address and port, the bind fails, which is expected, but it also fails if the other socket is already dying and
is in TIME_WAIT state. To be able to bind a socket to the same address and port as another socket
in TIME_WAIT state requires either SO_REUSEADDR to be set on that socket or SO_REUSEPORT must have
been set on both sockets prior to binding them. Of course it is allowed to set
both, SO_REUSEPORT and SO_REUSEADDR, on a socket.
There is not much more to say about SO_REUSEPORT other than that it was added later
than SO_REUSEADDR, that's why you will not find it in many socket implementations of other systems,
which "forked" the BSD code before this option was added, and that there was no way to bind two sockets
to exactly the same socket address in BSD prior to this option.
you would create two connected sockets, whose tuples are absolutely identical. This cannot work, at least
not for TCP connections (UDP connections are no real connections anyway). If data arrived for either one
of the two connections, the system could not tell which connection the data belongs to. At least the
destination address or destination port must be different for either connection, so that the system has no
problem to identify which connection incoming data belongs to.
So if you bind two sockets of the same protocol to the same source address and port and try to connect
them both to the same destination address and port, connect() will actually fail with the
error EADDRINUSE for the second socket you try to connect, which means that a socket with an identical
tuple of five values is already connected.
Multicast Addresses
Most people ignore the fact that multicast addresses exist, but they do exist. While unicast addresses are
used for one-to-one communication, multicast addresses are used for one-to-many communication. Most
people became aware of multicast addresses when they learned about IPv6 but multicast addresses also
existed in IPv4, even though this feature was never widely used on the public Internet.
The meaning of SO_REUSEADDR changes for multicast addresses as it allows multiple sockets to be
bound to exactly the same combination of source multicast address and port. In other words, for multicast
addresses SO_REUSEADDR behaves exactly as SO_REUSEPORT for unicast addresses. Actually the code
treats SO_REUSEADDR and SO_REUSEPORT identically for multicast addresses, that means you could say
that SO_REUSEADDR implies SO_REUSEPORT for all multicast addresses and the other way round.
FreeBSD/OpenBSD/NetBSD
All these are rather late forks of the original BSD code, that's why they all three offer the same options as
BSD and they also behave the same way as in BSD.
MacOS X
At its very core, MacOS X is simply a BSD-style UNIX, based on a rather late fork of the BSD code, which
was even synchronized with FreeBSD 5 for the Mac OS 10.3 release. That's why MacOS X offers the
same options as BSD and they also behave the same way as in BSD.
iOS
iOS is just modified MacOS X at its core, so everything that applies to MacOS X also applies to iOS.
Linux
Prior to Linux 3.9, only the option SO_REUSEADDR existed. This option behaves generally the same as in BSD
with two important exceptions. One exception is that if a listening (server) TCP socket is already bound
to a wildcard IP address and a specific port, no other TCP socket can be bound to the same port,
regardless whether either one or both sockets have this flag set. Not even if it would use a more specific
address (as is allowed in case of BSD). This restriction does not apply to non-listening (client) TCP
sockets and it is also possible to first bind a listening TCP socket to a specific IP address and port
combination and later on bind another one to a wildcard IP address and the same port. The second
exception is that for UDP sockets this option behaves exactly like SO_REUSEPORT in BSD, so two UDP
sockets can be bound to exactly the same address and port combination as long as both had this flag set
before they were bound.
Linux 3.9 added the option SO_REUSEPORT to Linux as well. This option allows two (or more) sockets, TCP
or UDP, listening (server) or non-listening (client), to be bound to exactly the same address and port
combination as long as all sockets (including the very first one) had this flag set prior to binding them. To
prevent "port hijacking", there is one special limitation, though: All sockets that want to share the same
address and port combination must belong to processes that share the same effective user ID! So one
user cannot "steal" ports of another user. Additionally the kernel performs some "special magic"
for SO_REUSEPORT sockets that isn't found in any other operating system so far: For UDP sockets, it tries
to distribute datagrams evenly, for TCP listening sockets, it tries to distribute incoming connect requests
(those accepted by calling accept()) evenly across all the sockets that share the same address and port
combination. That means while it is more or less random which socket receives a datagram or connect
request in other operating systems that allow full address reuse, Linux tries to optimize distribution so that,
for example, multiple instances of a simple server process can easily use SO_REUSEPORT sockets to
achieve a kind of simple load balancing and that absolutely for free as the kernel is doing "all the hard
work" for them.
Android
Even though the whole Android system is somewhat different from most Linux distributions, at its core
works a slightly modified Linux kernel, thus everything that applies to Linux applies to Android as well.
Windows
Windows only knows the SO_REUSEADDR option, there is no SO_REUSEPORT. Setting SO_REUSEADDR on a
socket in Windows behaves like setting SO_REUSEPORT and SO_REUSEADDR on a socket in BSD, with one
exception: A socket with SO_REUSEADDR can always bind to exactly the same source address and port as
an already bound socket, even if the other socket did not have this option set when it was bound.
This behavior is somewhat dangerous because it allows an application "to steal" the connected port of
another application. Needless to say, this can have major security implications. Microsoft realized that this
might be a problem and thus added another socket option SO_EXCLUSIVEADDRUSE.
Setting SO_EXCLUSIVEADDRUSE on a socket makes sure that if the binding succeeds, the combination of
source address and port is owned exclusively by this socket and no other socket can bind to them, not
even if it has SO_REUSEADDR set.
Solaris
Solaris is the successor of SunOS. SunOS was originally based on a fork of BSD, SunOS 5 and later was
based on a fork of SVR4, however SVR4 is a merge of BSD, System V, and Xenix, so up to some degree
Solaris is also a BSD fork, and a rather early one. As a result Solaris only knows SO_REUSEADDR, there is
no SO_REUSEPORT. The SO_REUSEADDR behaves pretty much the same as it does in BSD. As far as I
know there is no way to get the same behavior as SO_REUSEPORT in Solaris, that means it is not possible
to bind two sockets to exactly the same address and port.
Similar to Windows, Solaris has an option to give a socket an exclusive binding. This option is
named SO_EXCLBIND. If this option is set on a socket prior to binding it, setting SO_REUSEADDR on another
socket has no effect if the two sockets are tested for an address conflict. E.g. if socketA is bound to a
wildcard address and socketB has SO_REUSEADDR enabled and is bound to a non-wildcard address and
the same port as socketA, this bind will normally succeed, unless socketA had SO_EXCLBIND enabled, in
which case it will fail regardless of the SO_REUSEADDR flag of socketB.