Sunday, 9 November 2008

The Ephemeral Ports Problem (and solution)

Richard Jones ran into a problem doing load tests with many open sockets. It was quite an interesting thing to investigate. For reasons still unknown to me after many hours of reading kernel code, documentation, and mailing lists, Linux shares the assigned list of ephemeral ports across all local IPs for unconnected sockets. I hope this post will save some time to someone and there is a workaround suggestion for libevent. (By the way RJ, you were right on this problem! Though I differ now on the solution... Sorry, can't help it, it seems :)

Ephemeral Ports

Ephemeral ports are an assigned range of IP ports to be used by unprivileged processes. This range is usually a few thousand ports above 1024. This range can be modified by the administrator by modifying the relevant sysctl variable. As you probably know, transport protocols in the TCP/IP suite use ports, and when a program wants to initiate communication with these protocols it needs to ask for a local address; if that is not done explicitly, the OS assigns one automatically. The problem lies in the lookup of available ports.

The operating system tracks the port numbers in use and also the ones in use recently (to know how to handle leftover incoming network packets.) In Linux, this is done using a data structure called a hash table. The code in the Linux kernel for TCP/IP networking is quite complicated, lacks documentation or comments, and it is hard to track what is defined where. After many days banging my head against the crude code, I finally got it. Random posts on the internet and a side comment on a standards draft said the ephemeral range was shared across all local addresses on most operating systems. I wanted to know where, how, and if possible, why. So far, I only got the first two and only hints of the last.

Ephemeral Port Assignment

The pattern to create a TCP socket for client software is to call:
int  sock_fd = socket(AF_INET, SOCK_STREAM, 0);
This will make a socket of the TCP/IP family, of stream type (connected), of default protocol ("0" in this case, TCP.) After this you can manually assign it to a local address by calling:
bind(sock_fd, local_addr, local_addr_length);
That address should contain both the IP address and the port. If the port specified is 0, the kernel looks for an available port in the ephemeral range. After this you make the actual connection to the server with:
connect(sock_fd, destination_addr, destination_addr_length);
If the bind step was omitted the kernel's connect code does a similar, but slightly different lookup of available ports. Let's compare both lookups.

The bind lookup algorithm resides in net/ipv4/inet_connection_sock.c's function inet_csk_get_port():
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
/* ... */
if (!snum) {
int remaining, rover, low, high;

inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
rover = net_random() % remaining + low;

do {
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
if (tb->ib_net == net && tb->port == rover)
goto next;
break;
next:
spin_unlock(&head->lock);
if (++rover > high)
rover = low;
} while (--remaining > 0);

/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
* locks if this test triggers, because if 'remaining'
* drops to zero, we broke out of the do/while loop at
* the top level, not from the 'break;' statement.
*/
ret = 1;
if (remaining <= 0)
goto fail;

/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
snum = rover;
} else {
When snum is 0, it looks for an available bucket in the hash table, but if there is anything in it (any socket using that port, or recently closed) it keeps looking. If the search hits the end, the function fails. To note, there is no use of the local IP address in the hash table! The net parameter passed in isn't for that. The hash table only cares about port numbers. In contrast, the port lookup on connect in net/ipv4/inet_hashtables.c does:
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16, struct inet_timewait_sock **),
void (*hash)(struct sock *sk))
{
/* ... */
if (!snum) {
int i, remaining, low, high, port;
static u32 hint;
u32 offset = hint + port_offset;
struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;

inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;

local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock(&head->lock);

/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
inet_bind_bucket_for_each(tb, node, &head->chain) {
if (tb->ib_net == net && tb->port == port) {
WARN_ON(hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
goto next_port;
if (!check_established(death_row, sk,
port, &tw))
goto ok;
goto next_port;
}
}

tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
}
tb->fastreuse = -1;
goto ok;

next_port:
spin_unlock(&head->lock);
}
local_bh_enable();

return -EADDRNOTAVAIL;

ok:
/* ... */
The algorithm is quite similar but if the hash table bucket for the port is in use, it calls check_established() to perform further checks:

/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, __u16 lport,
struct inet_timewait_sock **twp)
{
/* ... */
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &head->twchain) {
tw = inet_twsk(sk2);

if (INET_TW_MATCH(sk2, net, hash, acookie,
saddr, daddr, ports, dif)) {
if (twsk_unique(sk, sk2, twp))
goto unique;
else
goto not_unique;
}
}
tw = NULL;

/* And established part... */
sk_for_each(sk2, node, &head->chain) {
if (INET_MATCH(sk2, net, hash, acookie,
saddr, daddr, ports, dif))
goto not_unique;
}

unique:
/* Must record num and sport now. Otherwise we will see
* in hash table socket with a funny identity. */
inet->num = lport;
inet->sport = htons(lport);
sk->sk_hash = hash;
WARN_ON(!sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock(lock);

if (twp) {
*twp = tw;
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);

inet_twsk_put(tw);
}

return 0;

not_unique:
write_unlock(lock);
return -EADDRNOTAVAIL;
}
This allows to reuse the same local port as long as the 5-tuple (protocol, source address, source port, destination address, destination port) doesn't exist already (the INET_MATCH call.)

Catch 22

So there is a dilemma on how to create more client TCP sockets than the number of available ephemeral ports (let's call n_sockets and n_ephemeral.)
  • Increasing n_sockets by using multiple source IP addresses (the RJ approach) won't work, because it will fail on the lookup of available ephemeral ports (it doesn't care about the source address.)
  • If you make just a connect call you get limited to n_ephemeral because the lookup isn't for IP and ephemeral port, it's just a lookup of a port within a local IP (as noted on the comment above.)
[Note: there is no way to do an incomplete bind of only the IP address part, leaving the port to be assigned later.]

After this situation RJ offered a patch to libevent to do it the way httperf does, binding local address and port. This means the client code has to do the port allocation lookup itself, and if not carefully managed it will waste an incredible amount of work on repeated bind() attempts. In my opinion, this is hackish and ugly. It's not their fault, they were cornered by poor implementations and poor interfaces. In RJ's case libevent always calls bind before connect so there isn't even a chance to do it right, as it is.

Also I didn't like the idea of having to bother the user to have more local addresses and having to pass that to the client program.

My $.02

As a programmer, one way to allow so many connections to a server from a single host would be to instead increase the number of ports the server is listening on. This is very common, should be trivial to do, and scales very well (n_ephemeral times the number of server ports.) The only limitation is if there is a firewall or some other kind of filter, but that is quite unlikely. In this particular case this would require a modification of libevent, to prevent it from calling bind() before connect if no local address is specified (for client code.) This is in effect a four line patch and no change of libevent API (RJ's diff adds another API function and is about 16 lines):
--- http.c      2008-09-08 01:11:13.000000000 +0100
+++ http.c.new 2008-11-13 02:09:12.000000000 +0000
@@ -1731,7 +1731,10 @@
assert(!(evcon->flags & EVHTTP_CON_INCOMING));
evcon->flags |= EVHTTP_CON_OUTGOING;

- evcon->fd = bind_socket(evcon->bind_address, 0 /*port*/, 0 /*reuse*/);
+ if (evcon->bind_address)
+ evcon->fd = bind_socket(evcon->bind_address, 0 /*port*/, 0 /*reuse*/);
+ else
+ evcon->fd = socket(AF_INET, SOCK_STREAM, 0); /* generic socket */
if (evcon->fd == -1) {
event_debug(("%s: failed to bind to \"%s\"",
__func__, evcon->bind_address));
(Yes, I already mailed Niels a few days ago about it. But as usual, he'll probably have a better way to do it. Hi Niels ;)

Trying to Make Sense of the Kernel Algorithm

Why are ephemeral ports searched this way? Why is bind() so strict? Well, at that point:
  • The kernel only knows it is a TCP socket.
  • It doesn't know if it is going to be a client or server (listen) socket.
  • And even if it knew it is a client, it wouldn't know yet the destination address and port.
Some peripheral comments on the subject on the Linux kernel mailing list mention the issues of strange things like double connects (valid in TCP.) I am still not convinced this isn't just an archaic lookup that doesn't consider the local address. This issue is discussed by Fernando Gont (a fellow UTN-er, what a coincidence) in his IETF draft. This was made earlier this year (February 2008), I guess for working out the issues with port prediction (like Dan Kaminsky's DNS bug.) Very interesting read.

Extra


Here is some code to play with:
/*
Copyright (C) 2008 Alejo Sanchez

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/resource.h>
#include <errno.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>

/* Local source addresses rotated through in BIND_ONLY mode (both loopback). */
const char *addrs[] = { "127.0.0.1", "127.0.0.2" };

/*
 * Open -n sockets (default 120000) to demonstrate ephemeral port
 * exhaustion.  With -DBIND_ONLY each socket is bound (port 0, kernel
 * picks an ephemeral port) to a rotating local address; otherwise each
 * socket connect()s to one of four local destination ports.
 *
 * Fixes vs. the original posting:
 *   - fprintf used %d with a char * portstr argument (undefined
 *     behavior per C99 7.19.6.1); now %s.
 *   - getaddrinfo() results must be released with freeaddrinfo(),
 *     not free() (POSIX); plain free() leaked the internal members.
 *   - getaddrinfo()/socket() return values are now checked.
 *   - the fd array was over-allocated by 2x for no visible reason.
 */
int
main (int argc, char **argv)
{
	struct rlimit rl;	/* to bump up system limits for this process */
	int *sockets;		/* ptr to array of socket fds */
	int nsockets = 120000;	/* default count; override with -n */
	int c, i;

	while ((c = getopt(argc, argv, "n:")) != -1) {
		switch (c) {
		case 'n':
			nsockets = atoi(optarg);
			break;
		default:
			fprintf(stderr, "Illegal argument \"%c\"\n", c);
			exit(1);
		}
	}

	/* Raise the fd limit so nsockets descriptors can actually be opened. */
	rl.rlim_cur = rl.rlim_max = nsockets + 10;
	if (setrlimit(RLIMIT_NOFILE, &rl) == -1) {
		perror("setrlimit");
		exit(1);
	}

	/* One fd per socket (the original allocated 2x this, unused). */
	if ((sockets = malloc(nsockets * sizeof(int))) == NULL) {
		perror("malloc");
		exit(1);
	}

	for (i = 0; i < nsockets; i++) {
#ifdef BIND_ONLY
		struct addrinfo *aitop, ai_hints = { .ai_family = AF_INET,
		    .ai_socktype = SOCK_STREAM, .ai_flags = AI_PASSIVE };
		const char *addr = addrs[i % (sizeof(addrs) / sizeof(addrs[0]))];
		const char *portstr = "0";	/* port 0: kernel assigns */

		if (getaddrinfo(addr, portstr, &ai_hints, &aitop) != 0) {
			fprintf(stderr, "getaddrinfo failed for %s\n", addr);
			exit(1);
		}

		sockets[i] = socket(AF_INET, SOCK_STREAM, 0);
		if (sockets[i] == -1) {
			perror("socket");
			exit(1);
		}

		if (bind(sockets[i], aitop->ai_addr, aitop->ai_addrlen) == -1) {
			/* was "%d" with char *portstr: UB */
			fprintf(stderr, "Error binding %s, for %s : %s\n",
			    strerror(errno), addr, portstr);
		} else
			fprintf(stderr, "ok addr: %s, i: %d\n", addr, i);
#else
		struct addrinfo *aitop, ai_hints = { .ai_family = AF_INET,
		    .ai_socktype = SOCK_STREAM, .ai_flags = AI_PASSIVE };
		char portstr[20];

		/* Rotate over four destination ports so the ephemeral
		 * range can be reused once per destination. */
		snprintf(portstr, sizeof(portstr), "%d", 8080 + (i % 4));
		if (getaddrinfo("127.0.0.1", portstr, &ai_hints, &aitop) != 0) {
			fprintf(stderr, "getaddrinfo failed for port %s\n",
			    portstr);
			exit(1);
		}

		sockets[i] = socket(AF_INET, SOCK_STREAM, 0);
		if (sockets[i] == -1) {
			perror("socket");
			exit(1);
		}

		if (connect(sockets[i], aitop->ai_addr, aitop->ai_addrlen) == -1) {
			/* was "%d" with char *portstr: UB */
			fprintf(stderr, "Error connecting %s, for port %s\n",
			    strerror(errno), portstr);
		}
#endif

		/* addrinfo lists are released with freeaddrinfo(),
		 * not free() — plain free() leaked the list members. */
		freeaddrinfo(aitop);
	}

	printf("%i socket pairs created, check memory. Sleeping 10 sec.\n", i);
	sleep(10);

	free(sockets);
	exit(0);
}
 
Creative Commons License
This work is licensed under a Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License.