From 076bb0c82a44fbe46fe2c8527a5b5b64b69f679d Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Wed, 10 Jul 2013 17:13:17 +0300 Subject: net: rename include/net/ll_poll.h to include/net/busy_poll.h Rename the file and correct all the places where it is included. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- net/core/sock.c | 2 +- net/core/sysctl_net_core.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 2 +- net/socket.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index 6e9ab31e457e..8ab48cd89559 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include /* * Is a socket 'connection oriented' ? diff --git a/net/core/sock.c b/net/core/sock.c index ab06b719f5b1..9bfe83f4d670 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,7 +139,7 @@ #include #endif -#include +#include static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index afc677eadd93..1a298cb3daee 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include static int one = 1; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 15cbfa94bd8e..5423223e93c2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,7 +279,7 @@ #include #include -#include +#include int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 35675e46aff8..3a261b41a00c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,7 +75,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6b270e53c207..bcc0ff2c16da 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -109,7 +109,7 @@ #include #include #include -#include +#include #include "udp_impl.h" struct udp_table udp_table __read_mostly; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5cffa5c3e6b8..345bd92d4ddb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -63,7 +63,7 @@ #include #include #include -#include +#include #include diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b6f31437a1f8..40e72034da07 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include diff --git a/net/socket.c b/net/socket.c index 45afa648364a..6a3e9a3f50ad 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,7 +104,7 @@ #include #include #include -#include +#include #ifdef CONFIG_NET_LL_RX_POLL unsigned int sysctl_net_ll_read __read_mostly; -- cgit v1.2.3 From 8b80cda536ea9bceec0364e897868a30ee13b992 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Wed, 10 Jul 2013 17:13:26 +0300 Subject: net: rename ll methods to busy-poll Rename ndo_ll_poll to ndo_busy_poll. Rename sk_mark_ll to sk_mark_napi_id. Rename skb_mark_ll to skb_mark_napi_id. Correct all useres of these functions. Update comments and defines in include/net/busy_poll.h Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3a261b41a00c..b299da5ff499 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1994,7 +1994,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_mark_ll(sk, skb); + sk_mark_napi_id(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bcc0ff2c16da..a0d7151ffbd9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1713,7 +1713,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; - sk_mark_ll(sk, skb); + sk_mark_napi_id(sk, skb); ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 345bd92d4ddb..6e1649d58533 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1499,7 +1499,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; - sk_mark_ll(sk, skb); + sk_mark_napi_id(sk, skb); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 40e72034da07..f4058150262b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -844,7 +844,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; - sk_mark_ll(sk, skb); + sk_mark_napi_id(sk, skb); ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); -- cgit v1.2.3 From 64b0dc517ea1b35d02565a779e6cb77ae9045685 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Wed, 10 Jul 2013 17:13:36 +0300 Subject: net: rename busy poll socket op and globals Rename LL_SO to BUSY_POLL_SO Rename sysctl_net_ll_{read,poll} to sysctl_busy_{read,poll} Fix up users of these variables. Fix documentation for sysctl. a patch for the socket.7 man page will follow separately, because of limitations of my mail setup. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- net/core/sock.c | 6 +++--- net/core/sysctl_net_core.c | 8 ++++---- net/socket.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 9bfe83f4d670..548d716c5f62 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -901,7 +901,7 @@ set_rcvbuf: break; #ifdef CONFIG_NET_LL_RX_POLL - case SO_LL: + case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) ret = -EPERM; @@ -1171,7 +1171,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; #ifdef CONFIG_NET_LL_RX_POLL - case SO_LL: + case SO_BUSY_POLL: v.val = sk->sk_ll_usec; break; #endif @@ -2294,7 +2294,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_LL_RX_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_ll_read; + sk->sk_ll_usec = sysctl_net_busy_read; #endif /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 1a298cb3daee..660968616637 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -300,15 +300,15 @@ static struct ctl_table net_core_table[] = { #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_LL_RX_POLL { - .procname = "low_latency_poll", - .data = &sysctl_net_ll_poll, + .procname = "busy_poll", + .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec }, { - .procname = "low_latency_read", - .data = &sysctl_net_ll_read, + .procname = "busy_read", + .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/net/socket.c b/net/socket.c index 6a3e9a3f50ad..829b460acb87 100644 --- a/net/socket.c +++ b/net/socket.c @@ -107,8 +107,8 @@ #include #ifdef CONFIG_NET_LL_RX_POLL -unsigned int sysctl_net_ll_read __read_mostly; -unsigned int sysctl_net_ll_poll __read_mostly; +unsigned int sysctl_net_busy_read __read_mostly; +unsigned int sysctl_net_busy_poll __read_mostly; #endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); -- cgit v1.2.3 From 1eb4f758286884e7566627164bca4c4a16952a83 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 10 Jul 2013 23:00:57 +0200 Subject: ipv6: in case of link failure remove route directly instead of letting it expire We could end up expiring a route which is part of an ecmp route set. Doing so would invalidate the rt->rt6i_nsiblings calculations and could provoke the following panic: [ 80.144667] ------------[ cut here ]------------ [ 80.145172] kernel BUG at net/ipv6/ip6_fib.c:733! [ 80.145172] invalid opcode: 0000 [#1] SMP [ 80.145172] Modules linked in: 8021q nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables ip6table_filter ip6_tables +snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_page_alloc snd_timer virtio_balloon snd soundcore i2c_piix4 i2c_core virtio_net virtio_blk [ 80.145172] CPU: 1 PID: 786 Comm: ping6 Not tainted 3.10.0+ #118 [ 80.145172] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 80.145172] task: ffff880117fa0000 ti: ffff880118770000 task.ti: ffff880118770000 [ 80.145172] RIP: 0010:[] [] fib6_add+0x75d/0x830 [ 80.145172] RSP: 0018:ffff880118771798 EFLAGS: 00010202 [ 80.145172] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88011350e480 [ 80.145172] RDX: ffff88011350e238 RSI: 0000000000000004 RDI: ffff88011350f738 [ 80.145172] RBP: ffff880118771848 R08: ffff880117903280 R09: 0000000000000001 [ 80.145172] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011350f680 [ 80.145172] R13: ffff880117903280 R14: ffff880118771890 R15: ffff88011350ef90 [ 80.145172] FS: 00007f02b5127740(0000) GS:ffff88011fd00000(0000) knlGS:0000000000000000 [ 80.145172] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 80.145172] CR2: 00007f981322a000 CR3: 00000001181b1000 CR4: 00000000000006e0 [ 80.145172] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 80.145172] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 80.145172] Stack: [ 80.145172] 0000000000000001 ffff880100000000 ffff880100000000 ffff880117903280 [ 80.145172] 0000000000000000 ffff880119a4cf00 0000000000000400 00000000000007fa [ 80.145172] 0000000000000000 0000000000000000 0000000000000000 ffff88011350f680 [ 80.145172] Call Trace: [ 80.145172] [] ? rt6_bind_peer+0x4b/0x90 [ 80.145172] [] __ip6_ins_rt+0x45/0x70 [ 80.145172] [] ip6_ins_rt+0x35/0x40 [ 80.145172] [] ip6_pol_route.isra.44+0x3a4/0x4b0 [ 80.145172] [] ip6_pol_route_output+0x2a/0x30 [ 80.145172] [] fib6_rule_action+0xd7/0x210 [ 80.145172] [] ? ip6_pol_route_input+0x30/0x30 [ 80.145172] [] fib_rules_lookup+0xc6/0x140 [ 80.145172] [] fib6_rule_lookup+0x44/0x80 [ 80.145172] [] ? ip6_pol_route_input+0x30/0x30 [ 80.145172] [] ip6_route_output+0x73/0xb0 [ 80.145172] [] ip6_dst_lookup_tail+0x2c3/0x2e0 [ 80.145172] [] ? list_del+0x11/0x40 [ 80.145172] [] ? remove_wait_queue+0x3c/0x50 [ 80.145172] [] ip6_dst_lookup_flow+0x3d/0xa0 [ 80.145172] [] rawv6_sendmsg+0x267/0xc20 [ 80.145172] [] inet_sendmsg+0x63/0xb0 [ 80.145172] [] ? selinux_socket_sendmsg+0x23/0x30 [ 80.145172] [] sock_sendmsg+0xa6/0xd0 [ 80.145172] [] SYSC_sendto+0x128/0x180 [ 80.145172] [] ? update_curr+0xec/0x170 [ 80.145172] [] ? kvm_clock_get_cycles+0x9/0x10 [ 80.145172] [] ? __getnstimeofday+0x3e/0xd0 [ 80.145172] [] SyS_sendto+0xe/0x10 [ 80.145172] [] system_call_fastpath+0x16/0x1b [ 80.145172] Code: fe ff ff 41 f6 45 2a 06 0f 85 ca fe ff ff 49 8b 7e 08 4c 89 ee e8 94 ef ff ff e9 b9 fe ff ff 48 8b 82 28 05 00 00 e9 01 ff ff ff <0f> 0b 49 8b 54 24 30 0d 00 00 40 00 89 83 14 01 00 00 48 89 53 [ 80.145172] RIP [] fib6_add+0x75d/0x830 [ 80.145172] RSP [ 80.387413] ---[ end trace 02f20b7a8b81ed95 ]--- [ 80.390154] Kernel panic - not syncing: Fatal exception in interrupt Cc: Nicolas Dichtel Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bd5fd7054031..5b127e09c224 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1080,10 +1080,13 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { - if (rt->rt6i_flags & RTF_CACHE) - rt6_update_expires(rt, 0); - else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + if (rt->rt6i_flags & RTF_CACHE) { + dst_hold(&rt->dst); + if (ip6_del_rt(rt)) + dst_free(&rt->dst); + } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; + } } } -- cgit v1.2.3 From 110ecd69a9feea82a152bbf9b12aba57e6396883 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 11 Jul 2013 13:16:54 -0400 Subject: 9p: fix off by one causing access violations and memory corruption p9_release_pages() would attempt to dereference one value past the end of pages[]. This would cause the following crashes: [ 6293.171817] BUG: unable to handle kernel paging request at ffff8807c96f3000 [ 6293.174146] IP: [] p9_release_pages+0x3b/0x60 [ 6293.176447] PGD 79c5067 PUD 82c1e3067 PMD 82c197067 PTE 80000007c96f3060 [ 6293.180060] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 6293.180060] Modules linked in: [ 6293.180060] CPU: 62 PID: 174043 Comm: modprobe Tainted: G W 3.10.0-next-20130710-sasha #3954 [ 6293.180060] task: ffff8807b803b000 ti: ffff880787dde000 task.ti: ffff880787dde000 [ 6293.180060] RIP: 0010:[] [] p9_release_pages+0x3b/0x60 [ 6293.214316] RSP: 0000:ffff880787ddfc28 EFLAGS: 00010202 [ 6293.214316] RAX: 0000000000000001 RBX: ffff8807c96f2ff8 RCX: 0000000000000000 [ 6293.222017] RDX: ffff8807b803b000 RSI: 0000000000000001 RDI: ffffea001c7e3d40 [ 6293.222017] RBP: ffff880787ddfc48 R08: 0000000000000000 R09: 0000000000000000 [ 6293.222017] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000001 [ 6293.222017] R13: 0000000000000001 R14: ffff8807cc50c070 R15: ffff8807cc50c070 [ 6293.222017] FS: 00007f572641d700(0000) GS:ffff8807f3600000(0000) knlGS:0000000000000000 [ 6293.256784] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 6293.256784] CR2: ffff8807c96f3000 CR3: 00000007c8e81000 CR4: 00000000000006e0 [ 6293.256784] Stack: [ 6293.256784] ffff880787ddfcc8 ffff880787ddfcc8 0000000000000000 ffff880787ddfcc8 [ 6293.256784] ffff880787ddfd48 ffffffff84128be8 ffff880700000002 0000000000000001 [ 6293.256784] ffff8807b803b000 ffff880787ddfce0 0000100000000000 0000000000000000 [ 6293.256784] Call Trace: [ 6293.256784] [] p9_virtio_zc_request+0x598/0x630 [ 6293.256784] [] ? wake_up_bit+0x40/0x40 [ 6293.256784] [] p9_client_zc_rpc+0x111/0x3a0 [ 6293.256784] [] ? sched_clock_cpu+0x108/0x120 [ 6293.256784] [] p9_client_read+0xe1/0x2c0 [ 6293.256784] [] v9fs_file_read+0x90/0xc0 [ 6293.256784] [] vfs_read+0xc3/0x130 [ 6293.256784] [] ? trace_hardirqs_on+0xd/0x10 [ 6293.256784] [] SyS_read+0x62/0xa0 [ 6293.256784] [] tracesys+0xdd/0xe2 [ 6293.256784] Code: 66 90 48 89 fb 41 89 f5 48 8b 3f 48 85 ff 74 29 85 f6 74 25 45 31 e4 66 0f 1f 84 00 00 00 00 00 e8 eb 14 12 fd 41 ff c4 49 63 c4 <48> 8b 3c c3 48 85 ff 74 05 45 39 e5 75 e7 48 83 c4 08 5b 41 5c [ 6293.256784] RIP [] p9_release_pages+0x3b/0x60 [ 6293.256784] RSP [ 6293.256784] CR2: ffff8807c96f3000 [ 6293.256784] ---[ end trace 50822ee72cd360fc ]--- Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/9p/trans_common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index de8df957867d..2ee3879161b1 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -24,11 +24,11 @@ */ void p9_release_pages(struct page **pages, int nr_pages) { - int i = 0; - while (pages[i] && nr_pages--) { - put_page(pages[i]); - i++; - } + int i; + + for (i = 0; i < nr_pages; i++) + if (pages[i]) + put_page(pages[i]); } EXPORT_SYMBOL(p9_release_pages); -- cgit v1.2.3 From afc154e978de1eb11c555bc8bcec1552f75ebc43 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 11 Jul 2013 12:43:42 +0200 Subject: ipv6: fix route selection if kernel is not compiled with CONFIG_IPV6_ROUTER_PREF This is a follow-up patch to 3630d40067a21d4dfbadc6002bb469ce26ac5d52 ("ipv6: rt6_check_neigh should successfully verify neigh if no NUD information are available"). Since the removal of rt->n in rt6_info we can end up with a dst == NULL in rt6_check_neigh. In case the kernel is not compiled with CONFIG_IPV6_ROUTER_PREF we should also select a route with unkown NUD state but we must not avoid doing round robin selection on routes with the same target. So introduce and pass down a boolean ``do_rr'' to indicate when we should update rt->rr_ptr. As soon as no route is valid we do backtracking and do a lookup on a higher level in the fib trie. v2: a) Improved rt6_check_neigh logic (no need to create neighbour there) and documented return values. v3: a) Introduce enum rt6_nud_state to get rid of the magic numbers (thanks to David Miller). b) Update and shorten commit message a bit to actualy reflect the source. Reported-by: Pierre Emeriaud Cc: YOSHIFUJI Hideaki Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/route.c | 63 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5b127e09c224..a8c891aa2464 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -65,6 +65,12 @@ #include #endif +enum rt6_nud_state { + RT6_NUD_FAIL_HARD = -2, + RT6_NUD_FAIL_SOFT = -1, + RT6_NUD_SUCCEED = 1 +}; + static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, const struct in6_addr *dest); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); @@ -531,28 +537,29 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif) return 0; } -static inline bool rt6_check_neigh(struct rt6_info *rt) +static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) { struct neighbour *neigh; - bool ret = false; + enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; if (rt->rt6i_flags & RTF_NONEXTHOP || !(rt->rt6i_flags & RTF_GATEWAY)) - return true; + return RT6_NUD_SUCCEED; rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) - ret = true; + ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(neigh->nud_state & NUD_FAILED)) - ret = true; + ret = RT6_NUD_SUCCEED; #endif read_unlock(&neigh->lock); - } else if (IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) { - ret = true; + } else { + ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? + RT6_NUD_SUCCEED : RT6_NUD_FAIL_SOFT; } rcu_read_unlock_bh(); @@ -566,43 +573,52 @@ static int rt6_score_route(struct rt6_info *rt, int oif, m = rt6_check_dev(rt, oif); if (!m && (strict & RT6_LOOKUP_F_IFACE)) - return -1; + return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; #endif - if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE)) - return -1; + if (strict & RT6_LOOKUP_F_REACHABLE) { + int n = rt6_check_neigh(rt); + if (n < 0) + return n; + } return m; } static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, - int *mpri, struct rt6_info *match) + int *mpri, struct rt6_info *match, + bool *do_rr) { int m; + bool match_do_rr = false; if (rt6_check_expired(rt)) goto out; m = rt6_score_route(rt, oif, strict); - if (m < 0) + if (m == RT6_NUD_FAIL_SOFT && !IS_ENABLED(CONFIG_IPV6_ROUTER_PREF)) { + match_do_rr = true; + m = 0; /* lowest valid score */ + } else if (m < 0) { goto out; + } + + if (strict & RT6_LOOKUP_F_REACHABLE) + rt6_probe(rt); if (m > *mpri) { - if (strict & RT6_LOOKUP_F_REACHABLE) - rt6_probe(match); + *do_rr = match_do_rr; *mpri = m; match = rt; - } else if (strict & RT6_LOOKUP_F_REACHABLE) { - rt6_probe(rt); } - out: return match; } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, struct rt6_info *rr_head, - u32 metric, int oif, int strict) + u32 metric, int oif, int strict, + bool *do_rr) { struct rt6_info *rt, *match; int mpri = -1; @@ -610,10 +626,10 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; for (rt = rr_head; rt && rt->rt6i_metric == metric; rt = rt->dst.rt6_next) - match = find_match(rt, oif, strict, &mpri, match); + match = find_match(rt, oif, strict, &mpri, match, do_rr); for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; rt = rt->dst.rt6_next) - match = find_match(rt, oif, strict, &mpri, match); + match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; } @@ -622,15 +638,16 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) { struct rt6_info *match, *rt0; struct net *net; + bool do_rr = false; rt0 = fn->rr_ptr; if (!rt0) fn->rr_ptr = rt0 = fn->leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); + match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + &do_rr); - if (!match && - (strict & RT6_LOOKUP_F_REACHABLE)) { + if (do_rr) { struct rt6_info *next = rt0->dst.rt6_next; /* no entries matched; do round-robin */ -- cgit v1.2.3 From 3b8ccd447375acebed9af0a3798e1ab4e58bedf4 Mon Sep 17 00:00:00 2001 From: Camelia Groza Date: Thu, 11 Jul 2013 09:55:51 +0300 Subject: inet: fix spacing in assignment Found using checkpatch.pl Signed-off-by: Camelia Groza Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 6af375afeeef..7bd8983dbfcf 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -467,7 +467,7 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); - done =__sk_nulls_del_node_init_rcu(sk); + done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); -- cgit v1.2.3 From cdbaa0bb26d8116d00be24e6b49043777b382f3a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 10 Jul 2013 17:05:06 -0700 Subject: gso: Update tunnel segmentation to support Tx checksum offload This change makes it so that the GRE and VXLAN tunnels can make use of Tx checksum offload support provided by some drivers via the hw_enc_features. Without this fix enabling GSO means sacrificing Tx checksum offload and this actually leads to a performance regression as shown below: Utilization Send Throughput local GSO 10^6bits/s % S state 6276.51 8.39 enabled 7123.52 8.42 disabled To resolve this it was necessary to address two items. First netif_skb_features needed to be updated so that it would correctly handle the Trans Ether Bridging protocol without impacting the need to check for Q-in-Q tagging. To do this it was necessary to update harmonize_features so that it used skb_network_protocol instead of just using the outer protocol. Second it was necessary to update the GRE and UDP tunnel segmentation offloads so that they would reset the encapsulation bit and inner header offsets after the offload was complete. As a result of this change I have seen the following results on a interface with Tx checksum enabled for encapsulated frames: Utilization Send Throughput local GSO 10^6bits/s % S state 7123.52 8.42 disabled 8321.75 5.43 enabled v2: Instead of replacing refrence to skb->protocol with skb_network_protocol just replace the protocol reference in harmonize_features to allow for double VLAN tag checks. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 14 ++++++-------- net/ipv4/gre_offload.c | 3 +++ net/ipv4/udp.c | 4 +++- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 560dafd83adf..a3d8d44cb7f4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2481,10 +2481,10 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) } static netdev_features_t harmonize_features(struct sk_buff *skb, - __be16 protocol, netdev_features_t features) + netdev_features_t features) { if (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, protocol)) { + !can_checksum_protocol(features, skb_network_protocol(skb))) { features &= ~NETIF_F_ALL_CSUM; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; @@ -2505,20 +2505,18 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, protocol, features); + return harmonize_features(skb, features); } features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); - if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) { - return harmonize_features(skb, protocol, features); - } else { + if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, protocol, features); - } + + return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 775d5b532ece..55e6bfb3a289 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -100,6 +100,9 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, } __skb_push(skb, tnl_hlen - ghl); + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb->mac_len = mac_len; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a0d7151ffbd9..766e6bab9113 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2323,6 +2323,9 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, struct udphdr *uh; int udp_offset = outer_hlen - tnl_hlen; + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + skb->mac_len = mac_len; skb_push(skb, outer_hlen); @@ -2345,7 +2348,6 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, uh->check = CSUM_MANGLED_0; } - skb->ip_summed = CHECKSUM_NONE; skb->protocol = protocol; } while ((skb = skb->next)); out: -- cgit v1.2.3 From 87f1369d6e2e820c77cf9eac542eed4dcf036f64 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 10 Jul 2013 15:46:08 +0200 Subject: pkt_sched: sch_qfq: improve efficiency of make_eligible In make_eligible, a mask is used to decide which groups must become eligible: the i-th group becomes eligible only if the i-th bit of the mask (from the right) is set. The mask is computed by left-shifting a 1 by a given number of places, and decrementing the result. The shift is performed on a ULL to avoid problems in case the number of places to shift is higher than 31. On a 32-bit machine, this is more costly than working on an UL. This patch replaces such a costly operation with two cheaper branches. The trick is based on the following fact: in case of a shift of at least 32 places, the resulting mask has at least the 32 less significant bits set, whereas the total number of groups is lower than 32. As a consequence, in this case it is enough to just set the 32 less significant bits of the mask with a cheaper ~0UL. In the other case, the shift can be safely performed on a UL. Reported-by: David S. Miller Reported-by: David Laight Signed-off-by: Paolo Valente Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 7c195d972bf0..8d86a8b5522a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -821,7 +821,14 @@ static void qfq_make_eligible(struct qfq_sched *q) unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { - unsigned long mask = (1ULL << fls(vslot ^ old_vslot)) - 1; + unsigned long mask; + int last_flip_pos = fls(vslot ^ old_vslot); + + if (last_flip_pos > 31) /* higher than the number of groups */ + mask = ~0UL; /* make all groups eligible */ + else + mask = (1UL << last_flip_pos) - 1; + qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } -- cgit v1.2.3 From 88d4f419a43b474a4524f41f55c36bee13416bdd Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 10 Jul 2013 15:46:09 +0200 Subject: pkt_sched: sch_qfq: remove forward declaration of qfq_update_agg_ts This patch removes the forward declaration of qfq_update_agg_ts, by moving the definition of the function above its first call. This patch also removes a useless forward declaration of qfq_schedule_agg. Reported-by: David S. Miller Signed-off-by: Paolo Valente Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 118 ++++++++++++++++++++++++---------------------------- 1 file changed, 55 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8d86a8b5522a..a7ab323849b6 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1010,9 +1010,61 @@ static inline void charge_actual_service(struct qfq_aggregate *agg) agg->F = agg->S + (u64)service_received * agg->inv_w; } -static inline void qfq_update_agg_ts(struct qfq_sched *q, - struct qfq_aggregate *agg, - enum update_reason reason); +/* Assign a reasonable start time for a new aggregate in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + unsigned long mask; + u64 limit, roundedF; + int slot_shift = agg->grp->slot_shift; + + roundedF = qfq_round_down(agg->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); + + if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], agg->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + if (qfq_gt(limit, next->F)) + agg->S = next->F; + else /* preserve timestamp correctness */ + agg->S = limit; + return; + } + } + agg->S = q->V; + } else /* timestamp is not stale */ + agg->S = agg->F; +} + +/* Update the timestamps of agg before scheduling/rescheduling it for + * service. In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. + */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, enum update_reason reason) +{ + if (reason != requeue) + qfq_update_start(q, agg); + else /* just charge agg for the service received */ + agg->S = agg->F; + + agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); @@ -1135,66 +1187,6 @@ static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) return agg; } -/* - * Assign a reasonable start time for a new aggregate in group i. - * Admissible values for \hat(F) are multiples of \sigma_i - * no greater than V+\sigma_i . Larger values mean that - * we had a wraparound so we consider the timestamp to be stale. - * - * If F is not stale and F >= V then we set S = F. - * Otherwise we should assign S = V, but this may violate - * the ordering in EB (see [2]). So, if we have groups in ER, - * set S to the F_j of the first group j which would be blocking us. - * We are guaranteed not to move S backward because - * otherwise our group i would still be blocked. - */ -static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) -{ - unsigned long mask; - u64 limit, roundedF; - int slot_shift = agg->grp->slot_shift; - - roundedF = qfq_round_down(agg->F, slot_shift); - limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); - - if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { - /* timestamp was stale */ - mask = mask_from(q->bitmaps[ER], agg->grp->index); - if (mask) { - struct qfq_group *next = qfq_ffs(q, mask); - if (qfq_gt(roundedF, next->F)) { - if (qfq_gt(limit, next->F)) - agg->S = next->F; - else /* preserve timestamp correctness */ - agg->S = limit; - return; - } - } - agg->S = q->V; - } else /* timestamp is not stale */ - agg->S = agg->F; -} - -/* - * Update the timestamps of agg before scheduling/rescheduling it for - * service. In particular, assign to agg->F its maximum possible - * value, i.e., the virtual finish time with which the aggregate - * should be labeled if it used all its budget once in service. - */ -static inline void -qfq_update_agg_ts(struct qfq_sched *q, - struct qfq_aggregate *agg, enum update_reason reason) -{ - if (reason != requeue) - qfq_update_start(q, agg); - else /* just charge agg for the service received */ - agg->S = agg->F; - - agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; -} - -static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *); - static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); -- cgit v1.2.3 From 8c91e162e058bb91b7766f26f4d5823a21941026 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 11 Jul 2013 13:12:22 -0700 Subject: gre: Fix MTU sizing check for gretap tunnels This change fixes an MTU sizing issue seen with gretap tunnels when non-gso packets are sent from the interface. In my case I was able to reproduce the issue by simply sending a ping of 1421 bytes with the gretap interface created on a device with a standard 1500 mtu. This fix is based on the fact that the tunnel mtu is already adjusted by dev->hard_header_len so it would make sense that any packets being compared against that mtu should also be adjusted by hard_header_len and the tunnel header instead of just the tunnel header. Signed-off-by: Alexander Duyck Reported-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 945734b2f209..ca1cb2d5f6e2 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -476,7 +476,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df) { struct ip_tunnel *tunnel = netdev_priv(dev); - int pkt_size = skb->len - tunnel->hlen; + int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; int mtu; if (df) -- cgit v1.2.3 From d77e41e12744e53ca7f98f920350998b5f00c93a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 10 Jul 2013 17:30:34 +0300 Subject: net/tipc: use %*phC to dump small buffers in hex form Instead of passing each byte by stack let's use nice specifier for that. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- net/tipc/ib_media.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index ad2e1ec4117e..9934a32bfa87 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -292,13 +292,7 @@ static int ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */ return 1; - sprintf(str_buf, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:" - "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", - a->value[0], a->value[1], a->value[2], a->value[3], - a->value[4], a->value[5], a->value[6], a->value[7], - a->value[8], a->value[9], a->value[10], a->value[11], - a->value[12], a->value[13], a->value[14], a->value[15], - a->value[16], a->value[17], a->value[18], a->value[19]); + sprintf(str_buf, "%20phC", a->value); return 0; } -- cgit v1.2.3 From 92338dc2fb33c8526256a458a520af73d9ab2d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CCosmin?= Date: Fri, 12 Jul 2013 09:33:33 +0300 Subject: net: strict_strtoul is obsolete, use kstrtoul instead patch found using checkpatch.pl Signed-off-by: Cosmin Stanescu Signed-off-by: David S. Miller --- net/dns_resolver/dns_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 0a69d0757795..f347a2ca7d7e 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -118,7 +118,7 @@ dns_resolver_instantiate(struct key *key, struct key_preparsed_payload *prep) if (opt_vlen <= 0) goto bad_option_value; - ret = strict_strtoul(eq, 10, &derrno); + ret = kstrtoul(eq, 10, &derrno); if (ret < 0) goto bad_option_value; -- cgit v1.2.3 From 40dadff26539d1695d2a37b44f66c53158439ae9 Mon Sep 17 00:00:00 2001 From: Sunghan Suh Date: Fri, 12 Jul 2013 16:17:23 +0900 Subject: net: access page->private by using page_private Signed-off-by: Sunghan Suh Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 724bb7cb173f..20e02d2605ec 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -824,7 +824,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) page = alloc_page(gfp_mask); if (!page) { while (head) { - struct page *next = (struct page *)head->private; + struct page *next = (struct page *)page_private(head); put_page(head); head = next; } @@ -834,7 +834,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) memcpy(page_address(page), vaddr + f->page_offset, skb_frag_size(f)); kunmap_atomic(vaddr); - page->private = (unsigned long)head; + set_page_private(page, (unsigned long)head); head = page; } @@ -848,7 +848,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) for (i = num_frags - 1; i >= 0; i--) { __skb_fill_page_desc(skb, i, head, 0, skb_shinfo(skb)->frags[i].size); - head = (struct page *)head->private; + head = (struct page *)page_private(head); } skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; -- cgit v1.2.3 From 24ab6bec80861d0c55263047e8bf97e460a32e7b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Fri, 12 Jul 2013 11:33:04 -0700 Subject: tcp: account all retransmit failures Change snmp RETRANSFAILS stat to include timeout retransmit failures in addition to other loss recoveries. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3d609490f118..92fde8d1aa82 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2407,6 +2407,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * see tcp_input.c tcp_sacktag_write_queue(). */ TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } else { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } return err; } @@ -2528,10 +2530,9 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + if (tcp_retransmit_skb(sk, skb)) return; - } + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (tcp_in_cwnd_reduction(sk)) -- cgit v1.2.3 From 307f2fb95e9b96b3577916e73d92e104f8f26494 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 12 Jul 2013 23:46:33 +0200 Subject: ipv6: only static routes qualify for equal cost multipathing Static routes in this case are non-expiring routes which did not get configured by autoconf or by icmpv6 redirects. To make sure we actually get an ecmp route while searching for the first one in this fib6_node's leafs, also make sure it matches the ecmp route assumptions. v2: a) Removed RTF_EXPIRE check in dst.from chain. The check of RTF_ADDRCONF already ensures that this route, even if added again without RTF_EXPIRES (in case of a RA announcement with infinite timeout), does not cause the rt6i_nsiblings logic to go wrong if a later RA updates the expiration time later. v3: a) Allow RTF_EXPIRES routes to enter the ecmp route set. We have to do so, because an pmtu event could update the RTF_EXPIRES flag and we would not count this route, if another route joins this set. We now filter only for RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC, which are flags that don't get changed after rt6_info construction. Cc: Nicolas Dichtel Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 192dd1a0e188..5fc9c7a68d8d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -632,6 +632,12 @@ insert_above: return ln; } +static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + /* * Insert routing information in a node. */ @@ -646,6 +652,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); ins = &fn->leaf; @@ -691,9 +698,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, * To avoid long list, we only had siblings if the * route have a gateway. */ - if (rt->rt6i_flags & RTF_GATEWAY && - !(rt->rt6i_flags & RTF_EXPIRES) && - !(iter->rt6i_flags & RTF_EXPIRES)) + if (rt_can_ecmp && + rt6_qualify_for_ecmp(iter)) rt->rt6i_nsiblings++; } @@ -715,7 +721,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, /* Find the first route that have the same metric */ sibling = fn->leaf; while (sibling) { - if (sibling->rt6i_metric == rt->rt6i_metric) { + if (sibling->rt6i_metric == rt->rt6i_metric && + rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->rt6i_siblings, &sibling->rt6i_siblings); break; -- cgit v1.2.3