From 789f558cfb3680aeb52de137418637f6b04b7d22 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 12 Apr 2015 18:51:09 -0700 Subject: tcp/dccp: get rid of central timewait timer Using a timer wheel for timewait sockets was nice ~15 years ago when memory was expensive and machines had a single processor. This does not scale, code is ugly and source of huge latencies (Typically 30 ms have been seen, cpus spinning on death_lock spinlock.) We can afford to use an extra 64 bytes per timewait sock and spread timewait load to all cpus to have better behavior. Tested: On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1 on the target (lpaa24) Before patch : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 419594 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 437171 While test is running, we can observe 25 or even 33 ms latencies. lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2 lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2 After patch : About 90% increase of throughput : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 810442 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 800992 And latencies are kept to minimal values during this load, even if network utilization is 90% higher : lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 4 +- net/ipv4/inet_hashtables.c | 4 +- net/ipv4/inet_timewait_sock.c | 270 ++++++------------------------------------ net/ipv4/proc.c | 2 +- net/ipv4/tcp_ipv4.c | 4 +- net/ipv4/tcp_minisocks.c | 35 ++---- 6 files changed, 52 insertions(+), 267 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 76322c9867d5..70e8b3c308ec 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct sock *sk, struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; - s32 tmo; + long tmo; nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -258,7 +258,7 @@ static int inet_twsk_diag_fill(struct sock *sk, r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); - tmo = tw->tw_ttd - inet_tw_time_stamp(); + tmo = tw->tw_timer.expires - jiffies; if (tmo < 0) tmo = 0; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d4630bf2d9aa..c6fb80bd5826 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -388,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); } @@ -565,7 +565,7 @@ ok: spin_unlock(&head->lock); if (tw) { - inet_twsk_deschedule(tw, death_row); + inet_twsk_deschedule(tw); while (twrefcnt) { twrefcnt--; inet_twsk_put(tw); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 118f0f195820..00ec8d5d7e7e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, } /* Must be called with locally disabled BHs. */ -static void __inet_twsk_kill(struct inet_timewait_sock *tw, - struct inet_hashinfo *hashinfo) +static void inet_twsk_kill(struct inet_timewait_sock *tw) { + struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; struct inet_bind_hashbucket *bhead; int refcnt; /* Unlink from established hashes. */ @@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); atomic_sub(refcnt, &tw->tw_refcnt); + atomic_dec(&tw->tw_dr->tw_count); + inet_twsk_put(tw); } void inet_twsk_free(struct inet_timewait_sock *tw) @@ -168,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +void tw_timer_handler(unsigned long data) { - struct inet_timewait_sock *tw = - kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, - GFP_ATOMIC); + struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + + if (tw->tw_kill) + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); + else + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); + inet_twsk_kill(tw); +} + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, + struct inet_timewait_death_row *dr, + const int state) +{ + struct inet_timewait_sock *tw; + + if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets) + return NULL; + + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + GFP_ATOMIC); if (tw) { const struct inet_sock *inet = inet_sk(sk); kmemcheck_annotate_bitfield(tw, flags); + tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; @@ -196,13 +216,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); + setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this * timewait socket. */ atomic_set(&tw->tw_refcnt, 0); - inet_twsk_dead_node_init(tw); + __module_get(tw->tw_prot->owner); } @@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat } EXPORT_SYMBOL_GPL(inet_twsk_alloc); -/* Returns non-zero if quota exceeded. */ -static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, - const int slot) -{ - struct inet_timewait_sock *tw; - unsigned int killed; - int ret; - - /* NOTE: compare this to previous version where lock - * was released after detaching chain. It was racy, - * because tw buckets are scheduled in not serialized context - * in 2.3 (with netfilter), and with softnet it is common, because - * soft irqs are not sequenced. - */ - killed = 0; - ret = 0; -rescan: - inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { - __inet_twsk_del_dead_node(tw); - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); -#ifdef CONFIG_NET_NS - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); -#endif - inet_twsk_put(tw); - killed++; - spin_lock(&twdr->death_lock); - if (killed > INET_TWDR_TWKILL_QUOTA) { - ret = 1; - break; - } - - /* While we dropped twdr->death_lock, another cpu may have - * killed off the next TW bucket in the list, therefore - * do a fresh re-read of the hlist head node with the - * lock reacquired. We still use the hlist traversal - * macro in order to get the prefetches. - */ - goto rescan; - } - - twdr->tw_count -= killed; -#ifndef CONFIG_NET_NS - NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); -#endif - return ret; -} - -void inet_twdr_hangman(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - unsigned int need_timer; - - twdr = (struct inet_timewait_death_row *)data; - spin_lock(&twdr->death_lock); - - if (twdr->tw_count == 0) - goto out; - - need_timer = 0; - if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { - twdr->thread_slots |= (1 << twdr->slot); - schedule_work(&twdr->twkill_work); - need_timer = 1; - } else { - /* We purged the entire slot, anything left? */ - if (twdr->tw_count) - need_timer = 1; - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); - } - if (need_timer) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); -out: - spin_unlock(&twdr->death_lock); -} -EXPORT_SYMBOL_GPL(inet_twdr_hangman); - -void inet_twdr_twkill_work(struct work_struct *work) -{ - struct inet_timewait_death_row *twdr = - container_of(work, struct inet_timewait_death_row, twkill_work); - int i; - - BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) > - (sizeof(twdr->thread_slots) * 8)); - - while (twdr->thread_slots) { - spin_lock_bh(&twdr->death_lock); - for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { - if (!(twdr->thread_slots & (1 << i))) - continue; - - while (inet_twdr_do_twkill_work(twdr, i) != 0) { - if (need_resched()) { - spin_unlock_bh(&twdr->death_lock); - schedule(); - spin_lock_bh(&twdr->death_lock); - } - } - - twdr->thread_slots &= ~(1 << i); - } - spin_unlock_bh(&twdr->death_lock); - } -} -EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); - /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ /* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr) +void inet_twsk_deschedule(struct inet_timewait_sock *tw) { - spin_lock(&twdr->death_lock); - if (inet_twsk_del_dead_node(tw)) { - inet_twsk_put(tw); - if (--twdr->tw_count == 0) - del_timer(&twdr->tw_timer); - } - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); + if (del_timer_sync(&tw->tw_timer)) + inet_twsk_kill(tw); } EXPORT_SYMBOL(inet_twsk_deschedule); -void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo, const int timewait_len) +void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) { - struct hlist_head *list; - int slot; - /* timeout := RTO * 3.5 * * 3.5 = 1+2+0.5 to wait for two retransmits. @@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ - slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; - spin_lock(&twdr->death_lock); - - /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) - twdr->tw_count--; - else + tw->tw_kill = timeo <= 4*HZ; + if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { atomic_inc(&tw->tw_refcnt); - - if (slot >= INET_TWDR_RECYCLE_SLOTS) { - /* Schedule to slow timer */ - if (timeo >= timewait_len) { - slot = INET_TWDR_TWKILL_SLOTS - 1; - } else { - slot = DIV_ROUND_UP(timeo, twdr->period); - if (slot >= INET_TWDR_TWKILL_SLOTS) - slot = INET_TWDR_TWKILL_SLOTS - 1; - } - tw->tw_ttd = inet_tw_time_stamp() + timeo; - slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); - list = &twdr->cells[slot]; - } else { - tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK); - - if (twdr->twcal_hand < 0) { - twdr->twcal_hand = 0; - twdr->twcal_jiffie = jiffies; - twdr->twcal_timer.expires = twdr->twcal_jiffie + - (slot << INET_TWDR_RECYCLE_TICK); - add_timer(&twdr->twcal_timer); - } else { - if (time_after(twdr->twcal_timer.expires, - jiffies + (slot << INET_TWDR_RECYCLE_TICK))) - mod_timer(&twdr->twcal_timer, - jiffies + (slot << INET_TWDR_RECYCLE_TICK)); - slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - list = &twdr->twcal_row[slot]; + atomic_inc(&tw->tw_dr->tw_count); } - - hlist_add_head(&tw->tw_death_node, list); - - if (twdr->tw_count++ == 0) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); - spin_unlock(&twdr->death_lock); } EXPORT_SYMBOL_GPL(inet_twsk_schedule); -void inet_twdr_twcal_tick(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - int n, slot; - unsigned long j; - unsigned long now = jiffies; - int killed = 0; - int adv = 0; - - twdr = (struct inet_timewait_death_row *)data; - - spin_lock(&twdr->death_lock); - if (twdr->twcal_hand < 0) - goto out; - - slot = twdr->twcal_hand; - j = twdr->twcal_jiffie; - - for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { - if (time_before_eq(j, now)) { - struct hlist_node *safe; - struct inet_timewait_sock *tw; - - inet_twsk_for_each_inmate_safe(tw, safe, - &twdr->twcal_row[slot]) { - __inet_twsk_del_dead_node(tw); - __inet_twsk_kill(tw, twdr->hashinfo); -#ifdef CONFIG_NET_NS - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); -#endif - inet_twsk_put(tw); - killed++; - } - } else { - if (!adv) { - adv = 1; - twdr->twcal_jiffie = j; - twdr->twcal_hand = slot; - } - - if (!hlist_empty(&twdr->twcal_row[slot])) { - mod_timer(&twdr->twcal_timer, j); - goto out; - } - } - j += 1 << INET_TWDR_RECYCLE_TICK; - slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - twdr->twcal_hand = -1; - -out: - if ((twdr->tw_count -= killed) == 0) - del_timer(&twdr->tw_timer); -#ifndef CONFIG_NET_NS - NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); -#endif - spin_unlock(&twdr->death_lock); -} -EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); - void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) { @@ -509,7 +311,7 @@ restart: rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule(tw, twdr); + inet_twsk_deschedule(tw); local_bh_enable(); inet_twsk_put(tw); goto restart_rcu; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index d8953ef0770c..e1f3b911dd1e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, - tcp_death_row.tw_count, sockets, + atomic_read(&tcp_death_row.tw_count), sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 37578d52897e..3571f2be4470 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1685,7 +1685,7 @@ do_time_wait: iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; goto process; @@ -2242,9 +2242,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i) { + long delta = tw->tw_timer.expires - jiffies; __be32 dest, src; __u16 destp, srcp; - s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = tw->tw_daddr; src = tw->tw_rcv_saddr; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2088fdcca141..63d6311b5365 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, - .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, - .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), .hashinfo = &tcp_hashinfo, - .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, - (unsigned long)&tcp_death_row), - .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, - inet_twdr_twkill_work), -/* Short-time timewait calendar */ - - .twcal_hand = -1, - .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, - (unsigned long)&tcp_death_row), }; EXPORT_SYMBOL_GPL(tcp_death_row); @@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_RST; } @@ -174,11 +163,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -211,13 +198,12 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_deschedule(tw); inet_twsk_put(tw); return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -267,8 +253,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process); */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct inet_timewait_sock *tw = NULL; const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw; bool recycle_ok = false; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); - if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) - tw = inet_twsk_alloc(sk, state); + tw = inet_twsk_alloc(sk, &tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - inet_twsk_schedule(tw, &tcp_death_row, timeo, - TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this -- cgit v1.2.3