summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2008-11-20 00:40:07 -0800
committerDavid S. Miller <davem@davemloft.net>2008-11-20 00:40:07 -0800
commit5caea4ea7088e80ac5410d04660346094608b909 (patch)
treefad95133683c002d24ff5de7fb756dad806b41ed /net/ipv4
parentd8b83c57a7e497cba9b5cb156e63176323035785 (diff)
downloadlinux-5caea4ea7088e80ac5410d04660346094608b909.tar.gz
linux-5caea4ea7088e80ac5410d04660346094608b909.tar.bz2
linux-5caea4ea7088e80ac5410d04660346094608b909.zip
net: listening_hash get a spinlock per bucket
This patch prepares RCU migration of listening_hash table for TCP/DCCP protocols. listening_hash table being small (32 slots per protocol), we add a spinlock for each slot, instead of a single rwlock for whole table. This should reduce hold time of readers, and writers concurrency. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/inet_diag.c12
-rw-r--r--net/ipv4/inet_hashtables.c86
-rw-r--r--net/ipv4/tcp_ipv4.c24
3 files changed, 50 insertions, 72 deletions
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 41b36720e977..1cb154ed75ad 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
goto skip_listen_ht;
- inet_listen_lock(hashinfo);
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct sock *sk;
struct hlist_node *node;
+ struct inet_listen_hashbucket *ilb;
num = 0;
- sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+ ilb = &hashinfo->listening_hash[i];
+ spin_lock_bh(&ilb->lock);
+ sk_for_each(sk, node, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
goto syn_recv;
if (inet_csk_diag_dump(sk, skb, cb) < 0) {
- inet_listen_unlock(hashinfo);
+ spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -751,7 +753,7 @@ syn_recv:
goto next_listen;
if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
- inet_listen_unlock(hashinfo);
+ spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -760,12 +762,12 @@ next_listen:
cb->args[4] = 0;
++num;
}
+ spin_unlock_bh(&ilb->lock);
s_num = 0;
cb->args[3] = 0;
cb->args[4] = 0;
}
- inet_listen_unlock(hashinfo);
skip_listen_ht:
cb->args[0] = 1;
s_i = num = s_num = 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fd269cfef0ec..377d004e5723 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -111,35 +111,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
EXPORT_SYMBOL_GPL(__inet_inherit_port);
/*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
- __acquires(hashinfo->lhash_lock)
-{
- write_lock(&hashinfo->lhash_lock);
-
- if (atomic_read(&hashinfo->lhash_users)) {
- DEFINE_WAIT(wait);
-
- for (;;) {
- prepare_to_wait_exclusive(&hashinfo->lhash_wait,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&hashinfo->lhash_users))
- break;
- write_unlock_bh(&hashinfo->lhash_lock);
- schedule();
- write_lock_bh(&hashinfo->lhash_lock);
- }
-
- finish_wait(&hashinfo->lhash_wait, &wait);
- }
-}
-
-/*
* Don't inline this cruft. Here are some nice properties to exploit here. The
* BSD API does not allow a listening sock to specify the remote port nor the
* remote address for the connection. So always assume those are both
@@ -191,25 +162,25 @@ struct sock *__inet_lookup_listener(struct net *net,
const int dif)
{
struct sock *sk = NULL;
- const struct hlist_head *head;
+ struct inet_listen_hashbucket *ilb;
- read_lock(&hashinfo->lhash_lock);
- head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
- if (!hlist_empty(head)) {
- const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+ ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
+ spin_lock(&ilb->lock);
+ if (!hlist_empty(&ilb->head)) {
+ const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));
if (inet->num == hnum && !sk->sk_node.next &&
(!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
(sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
!sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
goto sherry_cache;
- sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
+ sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
}
if (sk) {
sherry_cache:
sock_hold(sk);
}
- read_unlock(&hashinfo->lhash_lock);
+ spin_unlock(&ilb->lock);
return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -389,8 +360,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
static void __inet_hash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct hlist_head *list;
- rwlock_t *lock;
+ struct inet_listen_hashbucket *ilb;
if (sk->sk_state != TCP_LISTEN) {
__inet_hash_nolisten(sk);
@@ -398,14 +368,12 @@ static void __inet_hash(struct sock *sk)
}
WARN_ON(!sk_unhashed(sk));
- list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &hashinfo->lhash_lock;
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- inet_listen_wlock(hashinfo);
- __sk_add_node(sk, list);
+ spin_lock(&ilb->lock);
+ __sk_add_node(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- write_unlock(lock);
- wake_up(&hashinfo->lhash_wait);
+ spin_unlock(&ilb->lock);
}
void inet_hash(struct sock *sk)
@@ -420,29 +388,27 @@ EXPORT_SYMBOL_GPL(inet_hash);
void inet_unhash(struct sock *sk)
{
- rwlock_t *lock;
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
if (sk_unhashed(sk))
- goto out;
+ return;
if (sk->sk_state == TCP_LISTEN) {
- local_bh_disable();
- inet_listen_wlock(hashinfo);
- lock = &hashinfo->lhash_lock;
+ struct inet_listen_hashbucket *ilb;
+
+ ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ spin_lock_bh(&ilb->lock);
if (__sk_del_node_init(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock_bh(&ilb->lock);
} else {
- lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+ rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
write_lock_bh(lock);
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(lock);
}
-
- write_unlock_bh(lock);
-out:
- if (sk->sk_state == TCP_LISTEN)
- wake_up(&hashinfo->lhash_wait);
}
EXPORT_SYMBOL_GPL(inet_unhash);
@@ -556,3 +522,13 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+ int i;
+
+ for (i = 0; i < INET_LHTABLE_SIZE; i++)
+ spin_lock_init(&h->listening_hash[i].lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5559fea61e87..330b08a12274 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
}
#endif
-struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
- .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
- .lhash_users = ATOMIC_INIT(0),
- .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-};
+struct inet_hashinfo tcp_hashinfo;
static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
@@ -1874,15 +1870,18 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct inet_connection_sock *icsk;
struct hlist_node *node;
struct sock *sk = cur;
+ struct inet_listen_hashbucket *ilb;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
if (!sk) {
st->bucket = 0;
- sk = sk_head(&tcp_hashinfo.listening_hash[0]);
+ ilb = &tcp_hashinfo.listening_hash[0];
+ spin_lock_bh(&ilb->lock);
+ sk = sk_head(&ilb->head);
goto get_sk;
}
-
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
++st->num;
if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1932,8 +1931,11 @@ start_req:
}
read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
+ spin_unlock_bh(&ilb->lock);
if (++st->bucket < INET_LHTABLE_SIZE) {
- sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
+ spin_lock_bh(&ilb->lock);
+ sk = sk_head(&ilb->head);
goto get_sk;
}
cur = NULL;
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
void *rc;
struct tcp_iter_state *st = seq->private;
- inet_listen_lock(&tcp_hashinfo);
st->state = TCP_SEQ_STATE_LISTENING;
rc = listening_get_idx(seq, &pos);
if (!rc) {
- inet_listen_unlock(&tcp_hashinfo);
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_idx(seq, pos);
}
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v);
if (!rc) {
- inet_listen_unlock(&tcp_hashinfo);
st->state = TCP_SEQ_STATE_ESTABLISHED;
rc = established_get_first(seq);
}
@@ -2130,7 +2129,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
}
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- inet_listen_unlock(&tcp_hashinfo);
+ spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
break;
case TCP_SEQ_STATE_TIME_WAIT:
case TCP_SEQ_STATE_ESTABLISHED:
@@ -2405,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
void __init tcp_v4_init(void)
{
+ inet_hashinfo_init(&tcp_hashinfo);
if (register_pernet_device(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}