From 8ea1eebb49a2dfee1dce621a638cc1626e542392 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 May 2022 17:05:52 -0700 Subject: net: inet: Remove count from inet_listen_hashbucket After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address") and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"), the count is no longer used. This patch removes it. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 1 - net/ipv4/inet_hashtables.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 98e1ec1a14f0..103fc7a30e60 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -111,7 +111,6 @@ struct inet_bind_hashbucket { #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - unsigned int count; union { struct hlist_head head; struct hlist_nulls_head nulls_head; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index a5d57fa679ca..968b15cc0a18 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -209,7 +209,6 @@ static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) else hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, &ilb2->head); - ilb2->count++; spin_unlock(&ilb2->lock); } @@ -225,7 +224,6 @@ static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) spin_lock(&ilb2->lock); hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); - ilb2->count--; spin_unlock(&ilb2->lock); } @@ -657,7 +655,6 @@ int __inet_hash(struct sock *sk, struct sock *osk) else __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); inet_hash2(hashinfo, sk); - ilb->count++; sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: @@ -689,7 +686,6 @@ static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; inet_unhash2(hashinfo, sk); - ilb->count--; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -882,7 +878,6 @@ void inet_hashinfo_init(struct inet_hashinfo *h) spin_lock_init(&h->listening_hash[i].lock); INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, i + LISTENING_NULLS_BASE); - h->listening_hash[i].count = 0; } h->lhash2 = NULL; @@ -896,7 +891,6 @@ static void init_hashinfo_lhash2(struct inet_hashinfo *h) for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); INIT_HLIST_HEAD(&h->lhash2[i].head); - h->lhash2[i].count = 0; } } -- cgit v1.2.3 From e8d0059000b20c4745c5b6a713f6adb269cff8ff Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 May 2022 17:05:58 -0700 Subject: net: inet: Open code inet_hash2 and inet_unhash2 This patch folds lhash2 related functions into __inet_hash and inet_unhash. This will make the removal of the listening_hash in a latter patch easier to review. First, this patch folds inet_hash2 into __inet_hash. For unhash, the current call sequence is like inet_unhash() => __inet_unhash() => inet_unhash2(). The specific testing cases in __inet_unhash() are mostly related to TCP_LISTEN sk and its caller inet_unhash() already has the TCP_LISTEN test, so this patch folds both __inet_unhash() and inet_unhash2() into inet_unhash(). Note that all listening_hash users also have lhash2 initialized, so the !h->lhash2 check is no longer needed. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 88 +++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 55 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 968b15cc0a18..4b00fcb9088d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -193,40 +193,6 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) return inet_lhash2_bucket(h, hash); } -static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) -{ - struct inet_listen_hashbucket *ilb2; - - if (!h->lhash2) - return; - - ilb2 = inet_lhash2_bucket_sk(h, sk); - - spin_lock(&ilb2->lock); - if (sk->sk_reuseport && sk->sk_family == AF_INET6) - hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - else - hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - spin_unlock(&ilb2->lock); -} - -static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) -{ - struct inet_listen_hashbucket *ilb2; - - if (!h->lhash2 || - WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) - return; - - ilb2 = inet_lhash2_bucket_sk(h, sk); - - spin_lock(&ilb2->lock); - hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); - spin_unlock(&ilb2->lock); -} - static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) @@ -631,6 +597,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb2; struct inet_listen_hashbucket *ilb; int err = 0; @@ -642,22 +609,29 @@ int __inet_hash(struct sock *sk, struct sock *osk) } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); spin_lock(&ilb->lock); + spin_lock(&ilb2->lock); if (sk->sk_reuseport) { err = inet_reuseport_add_sock(sk, ilb); if (err) goto unlock; } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) + sk->sk_family == AF_INET6) { + hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); - else + } else { + hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, + &ilb2->head); __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); - inet_hash2(hashinfo, sk); + } sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: + spin_unlock(&ilb2->lock); spin_unlock(&ilb->lock); return err; @@ -675,22 +649,6 @@ int inet_hash(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_hash); -static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) -{ - if (sk_unhashed(sk)) - return; - - if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_stop_listen_sock(sk); - if (ilb) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - - inet_unhash2(hashinfo, sk); - } - __sk_nulls_del_node_init_rcu(sk); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -} - void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; @@ -699,20 +657,40 @@ void inet_unhash(struct sock *sk) return; if (sk->sk_state == TCP_LISTEN) { + struct inet_listen_hashbucket *ilb2; struct inet_listen_hashbucket *ilb; ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb->lock); - __inet_unhash(sk, ilb); + spin_lock(&ilb2->lock); + if (sk_unhashed(sk)) { + spin_unlock(&ilb2->lock); + spin_unlock(&ilb->lock); + return; + } + + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_stop_listen_sock(sk); + + hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + spin_unlock(&ilb2->lock); spin_unlock(&ilb->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); - __inet_unhash(sk, NULL); + if (sk_unhashed(sk)) { + spin_unlock_bh(lock); + return; + } + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); } } -- cgit v1.2.3 From cae3873c5b3a4fcd9706fb461ff4e91bdf1f0120 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 May 2022 17:06:05 -0700 Subject: net: inet: Retire port only listening_hash The listen sk is currently stored in two hash tables, listening_hash (hashed by port) and lhash2 (hashed by port and address). After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address") and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"), the TCP-SYN lookup fast path does not use listening_hash. The commit 05c0b35709c5 ("tcp: seq_file: Replace listening_hash with lhash2") also moved the seq_file (/proc/net/tcp) iteration usage from listening_hash to lhash2. There are still a few listening_hash usages left. One of them is inet_reuseport_add_sock() which uses the listening_hash to search a listen sk during the listen() system call. This turns out to be very slow on use cases that listen on many different VIPs at a popular port (e.g. 443). [ On top of the slowness in adding to the tail in the IPv6 case ]. The latter patch has a selftest to demonstrate this case. This patch takes this chance to move all remaining listening_hash usages to lhash2 and then retire listening_hash. Since most changes need to be done together, it is hard to cut the listening_hash to lhash2 switch into small patches. The changes in this patch is highlighted here for the review purpose. 1. Because of the listening_hash removal, lhash2 can use the sk->sk_nulls_node instead of the icsk->icsk_listen_portaddr_node. This will also keep the sk_unhashed() check to work as is after stop adding sk to listening_hash. The union is removed from inet_listen_hashbucket because only nulls_head is needed. 2. icsk->icsk_listen_portaddr_node and its helpers are removed. 3. The current lhash2 users needs to iterate with sk_nulls_node instead of icsk_listen_portaddr_node. One case is in the inet[6]_lhash2_lookup(). Another case is the seq_file iterator in tcp_ipv4.c. One thing to note is sk_nulls_next() is needed because the old inet_lhash2_for_each_icsk_continue() does a "next" first before iterating. 4. Move the remaining listening_hash usage to lhash2 inet_reuseport_add_sock() which this series is trying to improve. inet_diag.c and mptcp_diag.c are the final two remaining use cases and is moved to lhash2 now also. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 2 -- include/net/inet_hashtables.h | 41 +-------------------------------- net/dccp/proto.c | 1 - net/ipv4/inet_diag.c | 5 ++-- net/ipv4/inet_hashtables.c | 47 ++++++++------------------------------ net/ipv4/tcp.c | 1 - net/ipv4/tcp_ipv4.c | 21 ++++++++--------- net/ipv6/inet6_hashtables.c | 5 ++-- net/mptcp/mptcp_diag.c | 4 ++-- 9 files changed, 26 insertions(+), 101 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3908296d103f..85cd695e7fd1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -66,7 +66,6 @@ struct inet_connection_sock_af_ops { * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data * @icsk_clean_acked Clean acked data hook - * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -96,7 +95,6 @@ struct inet_connection_sock { const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); - struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 103fc7a30e60..1b8706719d4f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -111,10 +111,7 @@ struct inet_bind_hashbucket { #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - union { - struct hlist_head head; - struct hlist_nulls_head nulls_head; - }; + struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ @@ -142,32 +139,8 @@ struct inet_hashinfo { /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; - - /* All the above members are written once at bootup and - * never written again _or_ are predominantly read-access. - * - * Now align to a new cache line as all the following members - * might be often dirty. - */ - /* All sockets in TCP_LISTEN state will be in listening_hash. - * This is the only table where wildcard'd TCP sockets can - * exist. listening_hash is only hashed by local port number. - * If lhash2 is initialized, the same socket will also be hashed - * to lhash2 by port and address. - */ - struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] - ____cacheline_aligned_in_smp; }; -#define inet_lhash2_for_each_icsk_continue(__icsk) \ - hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node) - -#define inet_lhash2_for_each_icsk(__icsk, list) \ - hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node) - -#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ - hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) - static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { @@ -229,23 +202,11 @@ static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum); -/* These can have wildcards, don't try too hard. */ -static inline u32 inet_lhashfn(const struct net *net, const unsigned short num) -{ - return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1); -} - -static inline int inet_sk_listen_hashfn(const struct sock *sk) -{ - return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num); -} - /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); -void inet_hashinfo_init(struct inet_hashinfo *h); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 58421f94427e..eb8e128e43e8 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1110,7 +1110,6 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); - inet_hashinfo_init(&dccp_hashinfo); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 581b5b2d72a5..b812eb36f0e3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1028,12 +1028,13 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) goto skip_listen_ht; - for (i = s_i; i < INET_LHTABLE_SIZE; i++) { + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; num = 0; - ilb = &hashinfo->listening_hash[i]; + ilb = &hashinfo->lhash2[i]; + spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4b00fcb9088d..0b8235fbd440 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -246,12 +246,11 @@ static struct sock *inet_lhash2_lookup(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, @@ -598,7 +597,6 @@ int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb2; - struct inet_listen_hashbucket *ilb; int err = 0; if (sk->sk_state != TCP_LISTEN) { @@ -608,31 +606,23 @@ int __inet_hash(struct sock *sk, struct sock *osk) return 0; } WARN_ON(!sk_unhashed(sk)); - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); - spin_lock(&ilb->lock); spin_lock(&ilb2->lock); if (sk->sk_reuseport) { - err = inet_reuseport_add_sock(sk, ilb); + err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) { - hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); - } else { - hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); - } + sk->sk_family == AF_INET6) + __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); + else + __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); - spin_unlock(&ilb->lock); return err; } @@ -658,29 +648,23 @@ void inet_unhash(struct sock *sk) if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; - struct inet_listen_hashbucket *ilb; - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ - spin_lock(&ilb->lock); spin_lock(&ilb2->lock); if (sk_unhashed(sk)) { spin_unlock(&ilb2->lock); - spin_unlock(&ilb->lock); return; } if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); - hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&ilb2->lock); - spin_unlock(&ilb->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); @@ -848,27 +832,14 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, } EXPORT_SYMBOL_GPL(inet_hash_connect); -void inet_hashinfo_init(struct inet_hashinfo *h) -{ - int i; - - for (i = 0; i < INET_LHTABLE_SIZE; i++) { - spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, - i + LISTENING_NULLS_BASE); - } - - h->lhash2 = NULL; -} -EXPORT_SYMBOL_GPL(inet_hashinfo_init); - static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); - INIT_HLIST_HEAD(&h->lhash2[i].head); + INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, + i + LISTENING_NULLS_BASE); } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b44fde435bd1..028513d3e2a2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4595,7 +4595,6 @@ void __init tcp_init(void) timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); - inet_hashinfo_init(&tcp_hashinfo); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 918816ec5dd4..218ad871c0e4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2283,16 +2283,15 @@ static void *listening_get_first(struct seq_file *seq) st->offset = 0; for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; struct sock *sk; ilb2 = &tcp_hashinfo.lhash2[st->bucket]; - if (hlist_empty(&ilb2->head)) + if (hlist_nulls_empty(&ilb2->nulls_head)) continue; spin_lock(&ilb2->lock); - inet_lhash2_for_each_icsk(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each(sk, node, &ilb2->nulls_head) { if (seq_sk_match(seq, sk)) return sk; } @@ -2311,15 +2310,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur) { struct tcp_iter_state *st = seq->private; struct inet_listen_hashbucket *ilb2; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; struct sock *sk = cur; ++st->num; ++st->offset; - icsk = inet_csk(sk); - inet_lhash2_for_each_icsk_continue(icsk) { - sk = (struct sock *)icsk; + sk = sk_nulls_next(sk); + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } @@ -2728,16 +2726,15 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; sock_hold(start_sk); iter->batch[iter->end_sk++] = start_sk; - icsk = inet_csk(start_sk); - inet_lhash2_for_each_icsk_continue(icsk) { - sk = (struct sock *)icsk; + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 32ccac10bd62..a758f2ab7b51 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -138,12 +138,11 @@ static struct sock *inet6_lhash2_lookup(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { - struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index dbb6d876a203..7f9a71780437 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -83,13 +83,13 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba struct net *net = sock_net(skb->sk); int i; - for (i = diag_ctx->l_slot; i < INET_LHTABLE_SIZE; i++) { + for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk; int num = 0; - ilb = &tcp_hashinfo.listening_hash[i]; + ilb = &tcp_hashinfo.lhash2[i]; rcu_read_lock(); spin_lock(&ilb->lock); -- cgit v1.2.3 From ec8cb4f617a23700d37018d249e3b05149d44a38 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 May 2022 17:06:11 -0700 Subject: net: selftests: Stress reuseport listen This patch adds a test that has 300 VIPs listening on port 443. Each VIP:443 will have 80 listening socks by using SO_REUSEPORT. Thus, it will have 24000 listening socks. Before removing the port only listening_hash, all socks will be in the same port 443 bucket and inet_reuseport_add_sock() spends much time to walk through the bucket. After removing the port only listening_hash and move all usage to the port+addr lhash2, each bucket in the ideal case has 80 sk which is much smaller than before. Here is the test result from a qemu: Before: listen 24000 socks took 210.210485362 (~210s) After: listen 24000 socks took 0.207173 (~210ms) Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 2 + .../selftests/net/stress_reuseport_listen.c | 105 +++++++++++++++++++++ .../selftests/net/stress_reuseport_listen.sh | 25 +++++ 3 files changed, 132 insertions(+) create mode 100644 tools/testing/selftests/net/stress_reuseport_listen.c create mode 100755 tools/testing/selftests/net/stress_reuseport_listen.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 811e9b712bea..7ea54af55490 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -38,6 +38,7 @@ TEST_PROGS += srv6_end_dt6_l3vpn_test.sh TEST_PROGS += vrf_strict_mode_test.sh TEST_PROGS += arp_ndisc_evict_nocarrier.sh TEST_PROGS += ndisc_unsolicited_na_test.sh +TEST_PROGS += stress_reuseport_listen.sh TEST_PROGS_EXTENDED := in_netns.sh setup_loopback.sh setup_veth.sh TEST_PROGS_EXTENDED += toeplitz_client.sh toeplitz.sh TEST_GEN_FILES = socket nettest @@ -56,6 +57,7 @@ TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender +TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh TEST_FILES := settings diff --git a/tools/testing/selftests/net/stress_reuseport_listen.c b/tools/testing/selftests/net/stress_reuseport_listen.c new file mode 100644 index 000000000000..ef800bb35a8e --- /dev/null +++ b/tools/testing/selftests/net/stress_reuseport_listen.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +/* Test listening on the same port 443 with multiple VIPS. + * Each VIP:443 will have multiple sk listening on by using + * SO_REUSEPORT. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define IP6_LADDR_START "2401:dead::1" +#define IP6_LPORT 443 +#define NSEC_PER_SEC 1000000000L +#define NSEC_PER_USEC 1000L + +static unsigned int nr_socks_per_vip; +static unsigned int nr_vips; + +static int *bind_reuseport_sock6(void) +{ + int *lfds, *cur_fd, err, optvalue = 1; + struct sockaddr_in6 sa6 = {}; + unsigned int i, j; + + sa6.sin6_family = AF_INET6; + sa6.sin6_port = htons(IP6_LPORT); + err = inet_pton(AF_INET6, IP6_LADDR_START, &sa6.sin6_addr); + if (err != 1) + error(1, err, "inet_pton(%s)", IP6_LADDR_START); + + lfds = malloc(nr_vips * nr_socks_per_vip * sizeof(lfds[0])); + if (!lfds) + error(1, errno, "cannot alloc array of lfds"); + + cur_fd = lfds; + for (i = 0; i < nr_vips; i++) { + for (j = 0; j < nr_socks_per_vip; j++) { + *cur_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (*cur_fd == -1) + error(1, errno, + "lfds[%u,%u] = socket(AF_INET6)", i, j); + + err = setsockopt(*cur_fd, SOL_SOCKET, SO_REUSEPORT, + &optvalue, sizeof(optvalue)); + if (err) + error(1, errno, + "setsockopt(lfds[%u,%u], SO_REUSEPORT)", + i, j); + + err = bind(*cur_fd, (struct sockaddr *)&sa6, + sizeof(sa6)); + if (err) + error(1, errno, "bind(lfds[%u,%u])", i, j); + cur_fd++; + } + sa6.sin6_addr.s6_addr32[3]++; + } + + return lfds; +} + +int main(int argc, const char *argv[]) +{ + struct timespec start_ts, end_ts; + unsigned long start_ns, end_ns; + unsigned int nr_lsocks; + int *lfds, i, err; + + if (argc != 3 || atoi(argv[1]) <= 0 || atoi(argv[2]) <= 0) + error(1, 0, "Usage: %s \n", + argv[0]); + + nr_vips = atoi(argv[1]); + nr_socks_per_vip = atoi(argv[2]); + nr_lsocks = nr_vips * nr_socks_per_vip; + lfds = bind_reuseport_sock6(); + + clock_gettime(CLOCK_MONOTONIC, &start_ts); + for (i = 0; i < nr_lsocks; i++) { + err = listen(lfds[i], 0); + if (err) + error(1, errno, "listen(lfds[%d])", i); + } + clock_gettime(CLOCK_MONOTONIC, &end_ts); + + start_ns = start_ts.tv_sec * NSEC_PER_SEC + start_ts.tv_nsec; + end_ns = end_ts.tv_sec * NSEC_PER_SEC + end_ts.tv_nsec; + + printf("listen %d socks took %lu.%lu\n", nr_lsocks, + (end_ns - start_ns) / NSEC_PER_SEC, + (end_ns - start_ns) / NSEC_PER_USEC); + + for (i = 0; i < nr_lsocks; i++) + close(lfds[i]); + + free(lfds); + return 0; +} diff --git a/tools/testing/selftests/net/stress_reuseport_listen.sh b/tools/testing/selftests/net/stress_reuseport_listen.sh new file mode 100755 index 000000000000..4de11da4092b --- /dev/null +++ b/tools/testing/selftests/net/stress_reuseport_listen.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + +NS='stress_reuseport_listen_ns' +NR_FILES=24100 +SAVED_NR_FILES=$(ulimit -n) + +setup() { + ip netns add $NS + ip netns exec $NS sysctl -q -w net.ipv6.ip_nonlocal_bind=1 + ulimit -n $NR_FILES +} + +cleanup() { + ip netns del $NS + ulimit -n $SAVED_NR_FILES +} + +trap cleanup EXIT +setup +# 300 different vips listen on port 443 +# Each vip:443 sockaddr has 80 LISTEN sock by using SO_REUSEPORT +# Total 24000 listening socks +ip netns exec $NS ./stress_reuseport_listen 300 80 -- cgit v1.2.3