From efeaa5550e4bfd335396415958fe3615530e5d5c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 May 2013 19:12:45 +0000 Subject: tcp: do not expire TCP fastopen cookies TCP metric cache expires entries after one hour. This probably makes sense for TCP RTT/RTTVAR/CWND, but not for TCP fastopen cookies. It's better to try the previous cookie. If it appears to be obsolete, the server will send us a new cookie anyway. Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f696d7c2e9fa..f6a005c485a9 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -96,7 +96,8 @@ struct tcpm_hash_bucket { static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) +static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, + bool fastopen_clear) { u32 val; @@ -122,9 +123,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); tm->tcpm_ts = 0; tm->tcpm_ts_stamp = 0; - tm->tcpm_fastopen.mss = 0; - tm->tcpm_fastopen.syn_loss = 0; - tm->tcpm_fastopen.cookie.len = 0; + if (fastopen_clear) { + tm->tcpm_fastopen.mss = 0; + tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.cookie.len = 0; + } } static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, @@ -154,7 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, } tm->tcpm_addr = *addr; - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, true); if (likely(!reclaim)) { tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; @@ -171,7 +174,7 @@ out_unlock: static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) { if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + 
TCP_METRICS_TIMEOUT))) - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, false); } #define TCP_METRICS_RECLAIM_DEPTH 5 -- cgit v1.2.3 From b56141ab34e2c3e2d7960cea12c20c99530c0c76 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 5 May 2013 04:56:22 +0000 Subject: net: frag, fix race conditions in LRU list maintenance This patch fixes a race between inet_frag_lru_move() and inet_frag_lru_add() which was introduced in commit 3ef0eb0db4bf92c6d2510fe5c4dc51852746f206 ("net: frag, move LRU list maintenance outside of rwlock") One cpu already added new fragment queue into hash but not into LRU. Other cpu found it in hash and tries to move it to the end of LRU. This leads to NULL pointer dereference inside of list_move_tail(). Another possible race condition is between inet_frag_lru_move() and inet_frag_lru_del(): a move can happen after deletion. This patch initializes LRU list head before adding fragment into hash and inet_frag_lru_move() doesn't touch it if it's empty. I saw this kernel oops two times in a couple of days. [119482.128853] BUG: unable to handle kernel NULL pointer dereference at (null) [119482.132693] IP: [] __list_del_entry+0x29/0xd0 [119482.136456] PGD 2148f6067 PUD 215ab9067 PMD 0 [119482.140221] Oops: 0000 [#1] SMP [119482.144008] Modules linked in: vfat msdos fat 8021q fuse nfsd auth_rpcgss nfs_acl nfs lockd sunrpc ppp_async ppp_generic bridge slhc stp llc w83627ehf hwmon_vid snd_hda_codec_hdmi snd_hda_codec_realtek kvm_amd k10temp kvm snd_hda_intel snd_hda_codec edac_core radeon snd_hwdep ath9k snd_pcm ath9k_common snd_page_alloc ath9k_hw snd_timer snd soundcore drm_kms_helper ath ttm r8169 mii [119482.152692] CPU 3 [119482.152721] Pid: 20, comm: ksoftirqd/3 Not tainted 3.9.0-zurg-00001-g9f95269 #132 To Be Filled By O.E.M. 
To Be Filled By O.E.M./RS880D [119482.161478] RIP: 0010:[] [] __list_del_entry+0x29/0xd0 [119482.166004] RSP: 0018:ffff880216d5db58 EFLAGS: 00010207 [119482.170568] RAX: 0000000000000000 RBX: ffff88020882b9c0 RCX: dead000000200200 [119482.175189] RDX: 0000000000000000 RSI: 0000000000000880 RDI: ffff88020882ba00 [119482.179860] RBP: ffff880216d5db58 R08: ffffffff8155c7f0 R09: 0000000000000014 [119482.184570] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88020882ba00 [119482.189337] R13: ffffffff81c8d780 R14: ffff880204357f00 R15: 00000000000005a0 [119482.194140] FS: 00007f58124dc700(0000) GS:ffff88021fcc0000(0000) knlGS:0000000000000000 [119482.198928] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [119482.203711] CR2: 0000000000000000 CR3: 00000002155f0000 CR4: 00000000000007e0 [119482.208533] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [119482.213371] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [119482.218221] Process ksoftirqd/3 (pid: 20, threadinfo ffff880216d5c000, task ffff880216d3a9a0) [119482.223113] Stack: [119482.228004] ffff880216d5dbd8 ffffffff8155dcda 0000000000000000 ffff000200000001 [119482.233038] ffff8802153c1f00 ffff880000289440 ffff880200000014 ffff88007bc72000 [119482.238083] 00000000000079d5 ffff88007bc72f44 ffffffff00000002 ffff880204357f00 [119482.243090] Call Trace: [119482.248009] [] ip_defrag+0x8fa/0xd10 [119482.252921] [] ipv4_conntrack_defrag+0x83/0xe0 [119482.257803] [] nf_iterate+0x8b/0xa0 [119482.262658] [] ? inet_del_offload+0x40/0x40 [119482.267527] [] nf_hook_slow+0x74/0x130 [119482.272412] [] ? 
inet_del_offload+0x40/0x40 [119482.277302] [] ip_rcv+0x268/0x320 [119482.282147] [] __netif_receive_skb_core+0x612/0x7e0 [119482.286998] [] __netif_receive_skb+0x18/0x60 [119482.291826] [] process_backlog+0xa0/0x160 [119482.296648] [] net_rx_action+0x139/0x220 [119482.301403] [] __do_softirq+0xe7/0x220 [119482.306103] [] run_ksoftirqd+0x28/0x40 [119482.310809] [] smpboot_thread_fn+0xff/0x1a0 [119482.315515] [] ? lg_local_lock_cpu+0x40/0x40 [119482.320219] [] kthread+0xc0/0xd0 [119482.324858] [] ? insert_kthread_work+0x40/0x40 [119482.329460] [] ret_from_fork+0x7c/0xb0 [119482.334057] [] ? insert_kthread_work+0x40/0x40 [119482.338661] Code: 00 00 55 48 8b 17 48 b9 00 01 10 00 00 00 ad de 48 8b 47 08 48 89 e5 48 39 ca 74 29 48 b9 00 02 20 00 00 00 ad de 48 39 c8 74 7a <4c> 8b 00 4c 39 c7 75 53 4c 8b 42 08 4c 39 c7 75 2b 48 89 42 08 [119482.343787] RIP [] __list_del_entry+0x29/0xd0 [119482.348675] RSP [119482.353493] CR2: 0000000000000000 Oops happened on this path: ip_defrag() -> ip_frag_queue() -> inet_frag_lru_move() -> list_move_tail() -> __list_del_entry() Signed-off-by: Konstantin Khlebnikov Cc: Jesper Dangaard Brouer Cc: Florian Westphal Cc: Eric Dumazet Cc: David S. Miller Acked-by: Florian Westphal Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- net/ipv4/inet_fragment.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e97d66a1fdde..7e06641e36ae 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -305,6 +305,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); + INIT_LIST_HEAD(&q->lru_list); return q; } -- cgit v1.2.3 From 0020356355192cbaf6d315515e6c95bd09618c3b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 May 2013 16:03:46 +0000 Subject: fib_trie: no need to delay vfree() Now that vfree() can be called from interrupt contexts, there's no need to play games with schedule_work() to escape calling vfree() from RCU callbacks. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ff06b7543d9f..49616fed9340 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -125,7 +125,6 @@ struct tnode { unsigned int empty_children; /* KEYLENGTH bits needed */ union { struct rcu_head rcu; - struct work_struct work; struct tnode *tnode_free; }; struct rt_trie_node __rcu *child[0]; @@ -383,12 +382,6 @@ static struct tnode *tnode_alloc(size_t size) return vzalloc(size); } -static void __tnode_vfree(struct work_struct *arg) -{ - struct tnode *tn = container_of(arg, struct tnode, work); - vfree(tn); -} - static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); @@ -397,10 +390,8 @@ static void __tnode_free_rcu(struct rcu_head *head) if (size <= PAGE_SIZE) kfree(tn); - else { - INIT_WORK(&tn->work, __tnode_vfree); - schedule_work(&tn->work); - } + else + vfree(tn); } static inline void tnode_free(struct tnode *tn) -- cgit v1.2.3 From 
243198d09f535f5cd74114f1b779c3da25bc70c8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 May 2013 16:05:55 +0000 Subject: rps_dev_flow_table_release(): no need to delay vfree() The same story as with fib_trie patch - vfree() from RCU callbacks is legitimate now. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7427ab5e27d8..981fed397d1d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -606,21 +606,11 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return sprintf(buf, "%lu\n", val); } -static void rps_dev_flow_table_release_work(struct work_struct *work) -{ - struct rps_dev_flow_table *table = container_of(work, - struct rps_dev_flow_table, free_work); - - vfree(table); -} - static void rps_dev_flow_table_release(struct rcu_head *rcu) { struct rps_dev_flow_table *table = container_of(rcu, struct rps_dev_flow_table, rcu); - - INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); - schedule_work(&table->free_work); + vfree(table); } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, -- cgit v1.2.3 From a3dbbc2bab8d9a6e55fc0af3906d1dddbc0c531e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 May 2013 02:15:13 +0000 Subject: netpoll: inverted down_trylock() test The return value is reversed from mutex_trylock(). Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a5802a8b652f..cec074be8c43 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -206,7 +206,7 @@ static void netpoll_poll_dev(struct net_device *dev) * the dev_open/close paths use this to block netpoll activity * while changing device state */ - if (!down_trylock(&ni->dev_lock)) + if (down_trylock(&ni->dev_lock)) return; if (!netif_running(dev)) { -- cgit v1.2.3 From cb4b102f0ab29fcbaf945c6b1f85ef006cdb8edc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 May 2013 08:28:41 +0000 Subject: tipc: add a bounds check in link_recv_changeover_msg() The bearer_id here comes from skb->data and it can be a number from 0 to 7. The problem is that the ->links[] array has only 2 elements so I have added a range check. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/tipc/link.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index daa6080a2a0c..3a6064b3d666 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2306,8 +2306,11 @@ static int link_recv_changeover_msg(struct tipc_link **l_ptr, struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf); u32 msg_typ = msg_type(tunnel_msg); u32 msg_count = msg_msgcnt(tunnel_msg); + u32 bearer_id = msg_bearer_id(tunnel_msg); - dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)]; + if (bearer_id >= MAX_BEARERS) + goto exit; + dest_link = (*l_ptr)->owner->links[bearer_id]; if (!dest_link) goto exit; if (dest_link == *l_ptr) { -- cgit v1.2.3 From 6bf15191f666c5965d212561d7a5c7b78b808dfa Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 May 2013 09:31:17 +0000 Subject: tipc: potential divide by zero in tipc_link_recv_fragment() The worry here is that fragm_sz could be zero since it comes from skb->data. Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- net/tipc/link.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 3a6064b3d666..a80feee5197a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2524,14 +2524,16 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm); u32 msg_sz = msg_size(imsg); u32 fragm_sz = msg_data_sz(fragm); - u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz); + u32 exp_fragm_cnt; u32 max = TIPC_MAX_USER_MSG_SIZE + NAMED_H_SIZE; + if (msg_type(imsg) == TIPC_MCAST_MSG) max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE; - if (msg_size(imsg) > max) { + if (fragm_sz == 0 || msg_size(imsg) > max) { kfree_skb(fbuf); return 0; } + exp_fragm_cnt = msg_sz / fragm_sz + !!(msg_sz % fragm_sz); pbuf = tipc_buf_acquire(msg_size(imsg)); if (pbuf != NULL) { pbuf->next = *pending; -- cgit v1.2.3