From d93c6258ee4255749c10012c50a31c08f4e9fb16 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Jan 2016 11:16:43 +0100 Subject: netfilter: conntrack: resched in nf_ct_iterate_cleanup Ulrich reports soft lockup with following (shortened) callchain: NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! __netif_receive_skb_core+0x6e4/0x774 process_backlog+0x94/0x160 net_rx_action+0x88/0x178 call_do_softirq+0x24/0x3c do_softirq+0x54/0x6c __local_bh_enable_ip+0x7c/0xbc nf_ct_iterate_cleanup+0x11c/0x22c [nf_conntrack] masq_inet_event+0x20/0x30 [nf_nat_masquerade_ipv6] atomic_notifier_call_chain+0x1c/0x2c ipv6_del_addr+0x1bc/0x220 [ipv6] Problem is that nf_ct_iterate_cleanup can run for a very long time since it can be interrupted by softirq processing. Moreover, atomic_notifier_call_chain runs with rcu readlock held. So lets call cond_resched() in nf_ct_iterate_cleanup and defer the call to a work queue for the atomic_notifier_call_chain case. We also need another cond_resched in get_next_corpse, since we have to deal with iter() always returning false, in that case get_next_corpse will walk entire conntrack table. Reported-by: Ulrich Weber Tested-by: Ulrich Weber Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 74 +++++++++++++++++++++++++++-- net/netfilter/nf_conntrack_core.c | 5 ++ 2 files changed, 76 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 31ba7ca19757..051b6a6bfff6 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -21,6 +21,10 @@ #include #include +#define MAX_WORK_COUNT 16 + +static atomic_t v6_worker_count; + unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, const struct net_device *out) @@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; +struct masq_dev_work { + struct work_struct work; + struct net *net; + int ifindex; +}; + +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + long index; + + w = container_of(work, struct masq_dev_work, work); + + index = w->ifindex; + nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&v6_worker_count); + module_put(THIS_MODULE); +} + +/* ipv6 inet notifier is an atomic notifier, i.e. we cannot + * schedule. + * + * Unfortunately, nf_ct_iterate_cleanup can run for a long + * time if there are lots of conntracks and the system + * handles high softirq load, so it frequently calls cond_resched + * while iterating the conntrack table. + * + * So we defer nf_ct_iterate_cleanup walk to the system workqueue. + * + * As we can have 'a lot' of inet_events (depending on amount + * of ipv6 addresses being deleted), we also need to add an upper + * limit to the number of queued work items. + */ static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; - struct netdev_notifier_info info; + const struct net_device *dev; + struct masq_dev_work *w; + struct net *net; + + if (event != NETDEV_DOWN || + atomic_read(&v6_worker_count) >= MAX_WORK_COUNT) + return NOTIFY_DONE; + + dev = ifa->idev->dev; + net = maybe_get_net(dev_net(dev)); + if (!net) + return NOTIFY_DONE; - netdev_notifier_info_init(&info, ifa->idev->dev); - return masq_device_event(this, event, &info); + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (w) { + atomic_inc(&v6_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = dev->ifindex; + w->net = net; + schedule_work(&w->work); + + return NOTIFY_DONE; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); + return NOTIFY_DONE; } static struct notifier_block masq_inet_notifier = { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 58882de06bd7..f60b4fdeeb8c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1412,6 +1412,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), } spin_unlock(lockp); local_bh_enable(); + cond_resched(); } for_each_possible_cpu(cpu) { @@ -1424,6 +1425,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), set_bit(IPS_DYING_BIT, &ct->status); } spin_unlock_bh(&pcpu->lock); + cond_resched(); } return NULL; found: @@ -1440,6 +1442,8 @@ void nf_ct_iterate_cleanup(struct net *net, struct nf_conn *ct; unsigned int bucket = 0; + might_sleep(); + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) @@ -1448,6 +1452,7 @@ void nf_ct_iterate_cleanup(struct net *net, /* ... else the timer will get him soon. */ nf_ct_put(ct); + cond_resched(); } } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); -- cgit v1.2.3 From 7c7bdf35991bb8f7cfaeaf22ea3a2f2d1967c166 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 24 Jan 2016 23:08:39 +0100 Subject: netfilter: nfnetlink: use original skbuff when acking batches Since bd678e09dc17 ("netfilter: nfnetlink: fix splat due to incorrect socket memory accounting in skbuff clones"), we don't manually attach the sk to the skbuff clone anymore, so we have to use the original skbuff from netlink_ack() which needs to access the sk pointer. Fixes: bd678e09dc17 ("netfilter: nfnetlink: fix splat due to incorrect socket memory accounting in skbuff clones") Reported-by: Dmitry Vyukov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a7ba23353dab..62e92af2384a 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -311,14 +311,14 @@ replay: #endif { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(skb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP); return kfree_skb(skb); } @@ -406,7 +406,7 @@ ack: * pointing to the batch header. */ nfnl_err_reset(&err_list); - netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); status |= NFNL_BATCH_FAILURE; goto done; } -- cgit v1.2.3 From 53c520c2ab79e9f3765d24116ab54f6d5b3cd563 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Jan 2016 13:16:59 +0100 Subject: netfilter: cttimeout: fix deadlock due to erroneous unlock/lock conversion The spin_unlock call should have been left as-is, revert. Fixes: b16c29191dc89bd ("netfilter: nf_conntrack: use safer way to lock all buckets") Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 94837d236ab0..2671b9deb103 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -312,7 +312,7 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) untimeout(h, timeout); } - nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); + spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); } local_bh_enable(); } -- cgit v1.2.3 From f224a6915f266921507bb6e50a82f87a3de5b4b5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 31 Jan 2016 14:35:59 +0100 Subject: crush: ensure bucket id is valid before indexing buckets array We were indexing the buckets array without verifying the index was within the [0,max_buckets) range. This could happen because a multistep rule does not have enough buckets and has CRUSH_ITEM_NONE for an intermediate result, which would feed in CRUSH_ITEM_NONE and make us crash. Reflects ceph.git commit 976a24a326da8931e689ee22fce35feab5b67b76. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 393bfb22d5bb..97ecf6f262aa 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -888,6 +888,7 @@ int crush_do_rule(const struct crush_map *map, osize = 0; for (i = 0; i < wsize; i++) { + int bno; /* * see CRUSH_N, CRUSH_N_MINUS macros. * basically, numrep <= 0 means relative to @@ -900,6 +901,13 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; + /* make sure bucket id is valid */ + bno = -1 - w[i]; + if (bno < 0 || bno >= map->max_buckets) { + /* w[i] is probably CRUSH_ITEM_NONE */ + dprintk(" bad w[i] %d\n", w[i]); + continue; + } if (firstn) { int recurse_tries; if (choose_leaf_tries) @@ -911,7 +919,7 @@ int crush_do_rule(const struct crush_map *map, recurse_tries = choose_tries; osize += crush_choose_firstn( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, numrep, curstep->arg2, @@ -930,7 +938,7 @@ int crush_do_rule(const struct crush_map *map, numrep : (result_max-osize)); crush_choose_indep( map, - map->buckets[-1-w[i]], + map->buckets[bno], weight, weight_max, x, out_size, numrep, curstep->arg2, -- cgit v1.2.3 From 56a4f3091dceb7dfc14dc3ef1d5f59fe39ba4447 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 31 Jan 2016 14:36:05 +0100 Subject: crush: ensure take bucket value is valid Ensure that the take argument is a valid bucket ID before indexing the buckets array. Reflects ceph.git commit 93ec538e8a667699876b72459b8ad78966d89c61. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 97ecf6f262aa..abb700621e4a 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -835,7 +835,8 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_TAKE: if ((curstep->arg1 >= 0 && curstep->arg1 < map->max_devices) || - (-1-curstep->arg1 < map->max_buckets && + (-1-curstep->arg1 >= 0 && + -1-curstep->arg1 < map->max_buckets && map->buckets[-1-curstep->arg1])) { w[0] = curstep->arg1; wsize = 1; -- cgit v1.2.3 From dc6ae6d8e7726bad4f1c87244b49cac851746c65 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 31 Jan 2016 14:36:07 +0100 Subject: crush: add chooseleaf_stable tunable Add a tunable to fix the bug that chooseleaf may cause unnecessary pg migrations when some device fails. Reflects ceph.git commit fdb3f664448e80d984470f32f04e2e6f03ab52ec. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index abb700621e4a..5fcfb98f309e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, * @local_retries: localized retries * @local_fallback_retries: localized fallback retries * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @stable: stable mode starts rep=0 in the recursive call for all replicas * @vary_r: pass r to recursive calls * @out2: second output vector for leaf items (if @recurse_to_leaf) * @parent_r: r value passed from the parent @@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int local_fallback_retries, int recurse_to_leaf, unsigned int vary_r, + unsigned int stable, int *out2, int parent_r) { @@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, int collide, reject; int count = out_size; - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep, tries, recurse_tries, local_retries, local_fallback_retries, - parent_r); + parent_r, stable); - for (rep = outpos; rep < numrep && count > 0 ; rep++) { + for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, - x, outpos+1, 0, + x, stable ? 1 : outpos+1, 0, out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, 0, vary_r, + stable, NULL, sub_r) <= outpos) /* didn't get leaf */ @@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, int choose_local_fallback_retries = map->choose_local_fallback_tries; int vary_r = map->chooseleaf_vary_r; + int stable = map->chooseleaf_stable; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -870,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, vary_r = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSELEAF_STABLE: + if (curstep->arg1 >= 0) + stable = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -932,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, choose_local_fallback_retries, recurse_to_leaf, vary_r, + stable, c+osize, 0); } else { -- cgit v1.2.3 From b9b519b78cfbef9ed1b7aabca63eaaa9d1682a71 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 1 Feb 2016 16:57:16 +0100 Subject: crush: decode and initialize chooseleaf_stable Also add missing \n while at it. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osdmap.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581d9f1f..243574c8cf33 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p); - dout("crush decode tunable choose_local_tries = %d", + dout("crush decode tunable choose_local_tries = %d\n", c->choose_local_tries); - dout("crush decode tunable choose_local_fallback_tries = %d", + dout("crush decode tunable choose_local_fallback_tries = %d\n", c->choose_local_fallback_tries); - dout("crush decode tunable choose_total_tries = %d", + dout("crush decode tunable choose_total_tries = %d\n", c->choose_total_tries); ceph_decode_need(p, end, sizeof(u32), done); c->chooseleaf_descend_once = ceph_decode_32(p); - dout("crush decode tunable chooseleaf_descend_once = %d", + dout("crush decode tunable chooseleaf_descend_once = %d\n", c->chooseleaf_descend_once); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_vary_r = ceph_decode_8(p); - dout("crush decode tunable chooseleaf_vary_r = %d", + dout("crush decode tunable chooseleaf_vary_r = %d\n", c->chooseleaf_vary_r); + /* skip straw_calc_version, allowed_bucket_algs */ + ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); + *p += sizeof(u8) + sizeof(u32); + + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_stable = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_stable = %d\n", + c->chooseleaf_stable); + done: dout("crush_decode success\n"); return c; -- cgit v1.2.3 From b0b31a8ffe54abf0a455bcaee54dd92f08817164 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 3 Feb 2016 15:25:48 +0100 Subject: libceph: MOSDOpReply v7 encoding Empty request_redirect_t (struct ceph_request_redirect in the kernel client) is now encoded with a bool. NEW_OSDOPREPLY_ENCODING feature bit overlaps with already supported CRUSH_TUNABLES5. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f235930d88..3534e12683d3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) u32 osdmap_epoch; int already_completed; u32 bytes; + u8 decode_redir; unsigned int i; tid = le64_to_cpu(msg->hdr.tid); @@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) p += 8 + 4; /* skip replay_version */ p += 8; /* skip user_version */ + if (le16_to_cpu(msg->hdr.version) >= 7) + ceph_decode_8_safe(&p, end, decode_redir, bad_put); + else + decode_redir = 1; + } else { + decode_redir = 0; + } + + if (decode_redir) { err = ceph_redirect_decode(&p, end, &redir); if (err) goto bad_put; -- cgit v1.2.3 From 16186a82de1fdd868255448274e64ae2616e2640 Mon Sep 17 00:00:00 2001 From: "subashab@codeaurora.org" Date: Tue, 2 Feb 2016 02:11:10 +0000 Subject: ipv6: addrconf: Fix recursive spin lock call A rcu stall with the following backtrace was seen on a system with forwarding, optimistic_dad and use_optimistic set. To reproduce, set these flags and allow ipv6 autoconf. This occurs because the device write_lock is acquired while already holding the read_lock. Back trace below - INFO: rcu_preempt self-detected stall on CPU { 1} (t=2100 jiffies g=3992 c=3991 q=4471) <6> Task dump for CPU 1: <2> kworker/1:0 R running task 12168 15 2 0x00000002 <2> Workqueue: ipv6_addrconf addrconf_dad_work <6> Call trace: <2> [] el1_irq+0x68/0xdc <2> [] _raw_write_lock_bh+0x20/0x30 <2> [] __ipv6_dev_ac_inc+0x64/0x1b4 <2> [] addrconf_join_anycast+0x9c/0xc4 <2> [] __ipv6_ifa_notify+0x160/0x29c <2> [] ipv6_ifa_notify+0x50/0x70 <2> [] addrconf_dad_work+0x314/0x334 <2> [] process_one_work+0x244/0x3fc <2> [] worker_thread+0x2f8/0x418 <2> [] kthread+0xe0/0xec v2: do addrconf_dad_kick inside read lock and then acquire write lock for ipv6_ifa_notify as suggested by Eric Fixes: 7fd2561e4ebdd ("net: ipv6: Add a sysctl to make optimistic addresses useful candidates") Cc: Eric Dumazet Cc: Erik Kline Cc: Hannes Frederic Sowa Signed-off-by: Subash Abhinov Kasiviswanathan Acked-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 38eeddedfc21..9efd9ffdc34c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3538,6 +3538,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; + bool notify = false; addrconf_join_solict(dev, &ifp->addr); @@ -3583,7 +3584,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ - ipv6_ifa_notify(RTM_NEWADDR, ifp); + notify = true; } } @@ -3591,6 +3592,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) out: spin_unlock(&ifp->lock); read_unlock_bh(&idev->lock); + if (notify) + ipv6_ifa_notify(RTM_NEWADDR, ifp); } static void addrconf_dad_start(struct inet6_ifaddr *ifp) -- cgit v1.2.3 From c58d6c93680f28ac58984af61d0a7ebf4319c241 Mon Sep 17 00:00:00 2001 From: Phil Turnbull Date: Tue, 2 Feb 2016 13:36:45 -0500 Subject: netfilter: nfnetlink: correctly validate length of batch messages If nlh->nlmsg_len is zero then an infinite loop is triggered because 'skb_pull(skb, msglen);' pulls zero bytes. The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len < NLMSG_HDRLEN' which bypasses the length validation and will later trigger an out-of-bound read. If the length validation does fail then the malformed batch message is copied back to userspace. However, we cannot do this because the nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in netlink_ack: [ 41.455421] ================================================================== [ 41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340 [ 41.456431] Read of size 4294967280 by task a.out/987 [ 41.456431] ============================================================================= [ 41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected [ 41.456431] ----------------------------------------------------------------------------- ... [ 41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00 ................ [ 41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00 ............... [ 41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05 .......@EV."3... [ 41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb ................ ^^ start of batch nlmsg with nlmsg_len=4294967280 ... [ 41.456431] Memory state around the buggy address: [ 41.456431] ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ^ [ 41.456431] ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 41.456431] ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb [ 41.456431] ================================================================== Fix this with better validation of nlh->nlmsg_len and by setting NFNL_BATCH_FAILURE if any batch message fails length validation. CAP_NET_ADMIN is required to trigger the bugs. Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch") Signed-off-by: Phil Turnbull Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 62e92af2384a..857ae89633af 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -328,10 +328,12 @@ replay: nlh = nlmsg_hdr(skb); err = 0; - if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) || - skb->len < nlh->nlmsg_len) { - err = -EINVAL; - goto ack; + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < nlh->nlmsg_len || + nlmsg_len(nlh) < sizeof(struct nfgenmsg)) { + nfnl_err_reset(&err_list); + status |= NFNL_BATCH_FAILURE; + goto done; } /* Only requests are handled by the kernel */ -- cgit v1.2.3 From 08a7f5d3f5c38ed745c3e99ee91975f20562d272 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 5 Feb 2016 10:20:21 +0100 Subject: netfilter: tee: select NF_DUP_IPV6 unconditionally The NETFILTER_XT_TARGET_TEE option selects NF_DUP_IPV6 whenever IP6_NF_IPTABLES is enabled, and it ensures that it cannot be builtin itself if NF_CONNTRACK is a loadable module, as that is a dependency for NF_DUP_IPV6. However, NF_DUP_IPV6 can be enabled even if IP6_NF_IPTABLES is turned off, and it only really depends on IPV6. With the current check in tee_tg6, we call nf_dup_ipv6() whenever NF_DUP_IPV6 is enabled. This can however be a loadable module which is unreachable from a built-in xt_TEE: net/built-in.o: In function `tee_tg6': :(.text+0x67728): undefined reference to `nf_dup_ipv6' The bug was originally introduced in the split of the xt_TEE module into separate modules for ipv4 and ipv6, and two patches tried to fix it unsuccessfully afterwards. This is a revert of the the first incorrect attempt to fix it, going back to depending on IPV6 as the dependency, and we adapt the 'select' condition accordingly. Signed-off-by: Arnd Bergmann Fixes: bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6") Fixes: 116984a316c3 ("netfilter: xt_TEE: use IS_ENABLED(CONFIG_NF_DUP_IPV6)") Fixes: 74ec4d55c4d2 ("netfilter: fix xt_TEE and xt_TPROXY dependencies") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 2 +- net/netfilter/xt_TEE.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8c067e6663a1..95e757c377f9 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -891,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK select NF_DUP_IPV4 - select NF_DUP_IPV6 if IP6_NF_IPTABLES != n + select NF_DUP_IPV6 if IPV6 ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 3eff7b67cdf2..6e57a3966dc5 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -131,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_NF_DUP_IPV6) +#if IS_ENABLED(CONFIG_IPV6) { .name = "TEE", .revision = 1, -- cgit v1.2.3 From 5cc6ce9ff27565949a1001a2889a8dd9fd09e772 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sat, 6 Feb 2016 23:31:19 -0500 Subject: netfilter: nft_counter: fix erroneous return values The nft_counter_init() and nft_counter_clone() functions should return negative error value -ENOMEM instead of positive ENOMEM. Signed-off-by: Anton Protopopov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index c7808fc19719..c9743f78f219 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -100,7 +100,7 @@ static int nft_counter_init(const struct nft_ctx *ctx, cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); @@ -138,7 +138,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu, GFP_ATOMIC); if (cpu_stats == NULL) - return ENOMEM; + return -ENOMEM; preempt_disable(); this_cpu = this_cpu_ptr(cpu_stats); -- cgit v1.2.3 From 415e3d3e90ce9e18727e8843ae343eda5a58fad6 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 3 Feb 2016 02:11:03 +0100 Subject: unix: correctly track in-flight fds in sending process user_struct The commit referenced in the Fixes tag incorrectly accounted the number of in-flight fds over a unix domain socket to the original opener of the file-descriptor. This allows another process to arbitrary deplete the original file-openers resource limit for the maximum of open files. Instead the sending processes and its struct cred should be credited. To do so, we add a reference counted struct user_struct pointer to the scm_fp_list and use it to account for the number of inflight unix fds. Fixes: 712f4aad406bb1 ("unix: properly account for FDs passed over unix sockets") Reported-by: David Herrmann Cc: David Herrmann Cc: Willy Tarreau Cc: Linus Torvalds Suggested-by: Linus Torvalds Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/scm.c | 7 +++++++ net/unix/af_unix.c | 4 ++-- net/unix/garbage.c | 8 ++++---- 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index 14596fb37172..2696aefdc148 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fplp = fpl; fpl->count = 0; fpl->max = SCM_MAX_FD; + fpl->user = NULL; } fpp = &fpl->fp[fpl->count]; @@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) *fpp++ = file; fpl->count++; } + + if (!fpl->user) + fpl->user = get_uid(current_user()); + return num; } @@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm) scm->fp = NULL; for (i=fpl->count-1; i>=0; i--) fput(fpl->fp[i]); + free_uid(fpl->user); kfree(fpl); } } @@ -336,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); } return new_fpl; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 49d5093eb055..29be035f9c65 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1496,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->fp[i]); + unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_destruct_scm(struct sk_buff *skb) @@ -1561,7 +1561,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) return -ENOMEM; for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); + unix_inflight(scm->fp->user, scm->fp->fp[i]); return max_level; } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8fcdc2283af5..6a0d48525fcf 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -116,7 +116,7 @@ struct sock *unix_get_socket(struct file *filp) * descriptor if it is for an AF_UNIX socket. */ -void unix_inflight(struct file *fp) +void unix_inflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -133,11 +133,11 @@ void unix_inflight(struct file *fp) } unix_tot_inflight++; } - fp->f_cred->user->unix_inflight++; + user->unix_inflight++; spin_unlock(&unix_gc_lock); } -void unix_notinflight(struct file *fp) +void unix_notinflight(struct user_struct *user, struct file *fp) { struct sock *s = unix_get_socket(fp); @@ -152,7 +152,7 @@ void unix_notinflight(struct file *fp) list_del_init(&u->link); unix_tot_inflight--; } - fp->f_cred->user->unix_inflight--; + user->unix_inflight--; spin_unlock(&unix_gc_lock); } -- cgit v1.2.3 From 44c3d0c1c0a880354e9de5d94175742e2c7c9683 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Feb 2016 17:55:01 -0800 Subject: ipv6: fix a lockdep splat Silence lockdep false positive about rcu_dereference() being used in the wrong context. First one should use rcu_dereference_protected() as we own the spinlock. Second one should be a normal assignation, as no barrier is needed. Fixes: 18367681a10bd ("ipv6 flowlabel: Convert np->ipv6_fl_list to RCU.") Reported-by: Dave Jones Signed-off-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1f9ebe3cbb4a..dc2db4f7b182 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; - (sfl = rcu_dereference(*sflp)) != NULL; + (sfl = rcu_dereference_protected(*sflp, + lockdep_is_held(&ip6_sk_fl_lock))) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq.flr_label) { if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; - *sflp = rcu_dereference(sfl->next); + *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); -- cgit v1.2.3 From 9cf7490360bf2c46a16b7525f899e4970c5fc144 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Feb 2016 19:31:12 -0800 Subject: tcp: do not drop syn_recv on all icmp reports Petr Novopashenniy reported that ICMP redirects on SYN_RECV sockets were leading to RST. This is of course incorrect. A specific list of ICMP messages should be able to drop a SYN_RECV. For instance, a REDIRECT on SYN_RECV shall be ignored, as we do not hold a dst per SYN_RECV pseudo request. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111751 Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Reported-by: Petr Novopashenniy Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 ++++++++--- net/ipv6/tcp_ipv6.c | 5 +++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a4d523709ab3..7f6ff037adaf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -311,7 +311,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ -void tcp_req_err(struct sock *sk, u32 seq) +void tcp_req_err(struct sock *sk, u32 seq, bool abort) { struct request_sock *req = inet_reqsk(sk); struct net *net = sock_net(sk); @@ -323,7 +323,7 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { + } else if (abort) { /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly @@ -383,7 +383,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } seq = ntohl(th->seq); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, + type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 006396e31cb0..1a5a70fb8551 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -327,6 +327,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; + bool fatal; int err; sk = __inet6_lookup_established(net, &tcp_hashinfo, @@ -345,8 +346,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } seq = ntohl(th->seq); + fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq); + return tcp_req_err(sk, seq, fatal); bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -400,7 +402,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - icmpv6_err_convert(type, code, &err); /* Might be for an request_sock */ switch (sk->sk_state) { -- cgit v1.2.3 From 5f74f82ea34c0da80ea0b49192bb5ea06e063593 Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Wed, 3 Feb 2016 09:26:57 +0100 Subject: net:Add sysctl_max_skb_frags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devices may have limits on the number of fragments in an skb they support. Current codebase uses a constant as maximum for number of fragments one skb can hold and use. When enabling scatter/gather and running traffic with many small messages the codebase uses the maximum number of fragments and may thereby violate the max for certain devices. The patch introduces a global variable as max number of fragments. Signed-off-by: Hans Westgaard Ry Reviewed-by: HÃ¥kon Bugge Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 ++ net/core/sysctl_net_core.c | 10 ++++++++++ net/ipv4/tcp.c | 4 ++-- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b2df375ec9c2..5bf88f58bee7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,8 @@ struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; +EXPORT_SYMBOL(sysctl_max_skb_frags); /** * skb_panic - private function for out-of-line support diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 95b6139d710c..a6beb7b6ae55 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -26,6 +26,7 @@ static int zero = 0; static int one = 1; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; +static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "max_skb_frags", + .data = &sysctl_max_skb_frags, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &max_skb_frags, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 19746b3fcbbe..0c36ef4a3f86 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -940,7 +940,7 @@ new_segment: i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { + if (!can_coalesce && i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1213,7 +1213,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == MAX_SKB_FRAGS || !sg) { + if (i == sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } -- cgit v1.2.3 From 7a84bd46647ff181eb2659fdc99590e6f16e501d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 3 Feb 2016 23:33:30 +0800 Subject: sctp: translate network order to host order when users get a hmacid Commit ed5a377d87dc ("sctp: translate host order to network order when setting a hmacid") corrected the hmacid byte-order when setting a hmacid. but the same issue also exists on getting a hmacid. We fix it by changing hmacids to host order when users get them with getsockopt. Fixes: Commit ed5a377d87dc ("sctp: translate host order to network order when setting a hmacid") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 5ca2ebfe0be8..e878da0949db 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5538,6 +5538,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; + int i; if (!ep->auth_enable) return -EACCES; @@ -5555,8 +5556,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, return -EFAULT; if (put_user(num_idents, &p->shmac_num_idents)) return -EFAULT; - if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len)) - return -EFAULT; + for (i = 0; i < num_idents; i++) { + __u16 hmacid = ntohs(hmacs->hmac_ids[i]); + + if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16))) + return -EFAULT; + } return 0; } -- cgit v1.2.3 From 461547f3158978c180d74484d58e82be9b8e7357 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 9 Feb 2016 02:49:54 -0800 Subject: flow_dissector: Fix unaligned access in __skb_flow_dissector when used by eth_get_headlen This patch fixes an issue with unaligned accesses when using eth_get_headlen on a page that was DMA aligned instead of being IP aligned. The fact is when trying to check the length we don't need to be looking at the flow label so we can reorder the checks to first check if we are supposed to gather the flow label and then make the call to actually get it. v2: Updated path so that either STOP_AT_FLOW_LABEL or KEY_FLOW_LABEL can cause us to check for the flow label. Reported-by: Sowmini Varadhan Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d79699c9d1b9..eab81bc80e5c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -208,7 +208,6 @@ ip: case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; - __be32 flow_label; ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); @@ -230,8 +229,12 @@ ipv6: key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - flow_label = ip6_flowlabel(iph); - if (flow_label) { + if ((dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL) || + (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && + ip6_flowlabel(iph)) { + __be32 flow_label = ip6_flowlabel(iph); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, -- cgit v1.2.3 From 7e059158d57b79159eaf1f504825d19866ef2c42 Mon Sep 17 00:00:00 2001 From: David Wragg Date: Wed, 10 Feb 2016 00:05:58 +0000 Subject: vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices Prior to 4.3, openvswitch tunnel vports (vxlan, gre and geneve) could transmit vxlan packets of any size, constrained only by the ability to send out the resulting packets. 4.3 introduced netdevs corresponding to tunnel vports. These netdevs have an MTU, which limits the size of a packet that can be successfully encapsulated. The default MTU values are low (1500 or less), which is awkwardly small in the context of physical networks supporting jumbo frames, and leads to a conspicuous change in behaviour for userspace. Instead, set the MTU on openvswitch-created netdevs to be the relevant maximum (i.e. the maximum IP packet size minus any relevant overhead), effectively restoring the behaviour prior to 4.3. Signed-off-by: David Wragg Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 8 ++++++++ net/ipv4/ip_tunnel.c | 20 +++++++++++++++++--- net/openvswitch/vport-vxlan.c | 2 ++ 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7c51c4e1661f..56fdf4e0dce4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1240,6 +1240,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, err = ipgre_newlink(net, dev, tb, NULL); if (err < 0) goto out; + + /* openvswitch users expect packet sizes to be unrestricted, + * so set the largest MTU we can. + */ + err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); + if (err) + goto out; + return dev; out: free_netdev(dev); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c7bd72e9b544..89e8861e05fc 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -943,17 +943,31 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); -int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); + int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen) + if (new_mtu < 68) return -EINVAL; + + if (new_mtu > max_mtu) { + if (strict) + return -EINVAL; + + new_mtu = max_mtu; + } + dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); + +int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + return __ip_tunnel_change_mtu(dev, new_mtu, true); +} EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 1605691d9414..de9cb19efb6a 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -91,6 +91,8 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) struct vxlan_config conf = { .no_share = true, .flags = VXLAN_F_COLLECT_METADATA, + /* Don't restrict the packets that can be sent by MTU */ + .mtu = IP_MAX_MTU, }; if (!options) { -- cgit v1.2.3 From 919483096bfe75dda338e98d56da91a263746a0a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Feb 2016 06:23:28 -0800 Subject: ipv4: fix memory leaks in ip_cmsg_send() callers Dmitry reported memory leaks of IP options allocated in ip_cmsg_send() when/if this function returns an error. Callers are responsible for the freeing. Many thanks to Dmitry for the report and diagnostic. Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 ++ net/ipv4/ping.c | 4 +++- net/ipv4/raw.c | 4 +++- net/ipv4/udp.c | 4 +++- 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5f73a7c03e27..a50124260f5a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + + /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c117b21b937d..d3a27165f9cc 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -746,8 +746,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bc35f1842512..7113bae4e6a0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -547,8 +547,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(net, msg, &ipc, false); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); goto out; + } if (ipc.opt) free = 1; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index be0b21852b13..95d2f198017e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1048,8 +1048,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc, sk->sk_family == AF_INET6); - if (err) + if (unlikely(err)) { + kfree(ipc.opt); return err; + } if (ipc.opt) free = 1; connected = 0; -- cgit v1.2.3 From 5988818008257ca42010d6b43a3e0e48afec9898 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 4 Feb 2016 10:50:45 -0800 Subject: vsock: Fix blocking ops call in prepare_to_wait We receoved a bug report from someone using vmware: WARNING: CPU: 3 PID: 660 at kernel/sched/core.c:7389 __might_sleep+0x7d/0x90() do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_wait+0x2d/0x90 Modules linked in: vmw_vsock_vmci_transport vsock snd_seq_midi snd_seq_midi_event snd_ens1371 iosf_mbi gameport snd_rawmidi snd_ac97_codec ac97_bus snd_seq coretemp snd_seq_device snd_pcm snd_timer snd soundcore ppdev crct10dif_pclmul crc32_pclmul ghash_clmulni_intel vmw_vmci vmw_balloon i2c_piix4 shpchp parport_pc parport acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc btrfs xor raid6_pq 8021q garp stp llc mrp crc32c_intel serio_raw mptspi vmwgfx drm_kms_helper ttm drm scsi_transport_spi mptscsih e1000 ata_generic mptbase pata_acpi CPU: 3 PID: 660 Comm: vmtoolsd Not tainted 4.2.0-0.rc1.git3.1.fc23.x86_64 #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/20/2014 0000000000000000 0000000049e617f3 ffff88006ac37ac8 ffffffff818641f5 0000000000000000 ffff88006ac37b20 ffff88006ac37b08 ffffffff810ab446 ffff880068009f40 ffffffff81c63bc0 0000000000000061 0000000000000000 Call Trace: [] dump_stack+0x4c/0x65 [] warn_slowpath_common+0x86/0xc0 [] warn_slowpath_fmt+0x55/0x70 [] ? debug_lockdep_rcu_enabled+0x1d/0x20 [] ? prepare_to_wait+0x2d/0x90 [] ? prepare_to_wait+0x2d/0x90 [] __might_sleep+0x7d/0x90 [] __might_fault+0x43/0xa0 [] copy_from_iter+0x87/0x2a0 [] __qp_memcpy_to_queue+0x9a/0x1b0 [vmw_vmci] [] ? qp_memcpy_to_queue+0x20/0x20 [vmw_vmci] [] qp_memcpy_to_queue_iov+0x17/0x20 [vmw_vmci] [] qp_enqueue_locked+0xa0/0x140 [vmw_vmci] [] vmci_qpair_enquev+0x4f/0xd0 [vmw_vmci] [] vmci_transport_stream_enqueue+0x1b/0x20 [vmw_vsock_vmci_transport] [] vsock_stream_sendmsg+0x2c5/0x320 [vsock] [] ? wake_atomic_t_function+0x70/0x70 [] sock_sendmsg+0x38/0x50 [] SYSC_sendto+0x104/0x190 [] ? vfs_read+0x8a/0x140 [] SyS_sendto+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x12/0x76 transport->stream_enqueue may call copy_to_user so it should not be called inside a prepare_to_wait. Narrow the scope of the prepare_to_wait to avoid the bad call. This also applies to vsock_stream_recvmsg as well. Reported-by: Vinson Lee Tested-by: Vinson Lee Signed-off-by: Laura Abbott Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 7fd1220fbfa0..bbe65dcb9738 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1557,8 +1557,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (total_written < len) { ssize_t written; @@ -1578,7 +1576,9 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); @@ -1588,8 +1588,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_wait; } - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } /* These checks occur both as part of and after the loop @@ -1635,7 +1633,6 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, out_wait: if (total_written > 0) err = total_written; - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; @@ -1716,7 +1713,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (1) { s64 ready = vsock_stream_has_data(vsk); @@ -1727,7 +1723,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, */ err = -ENOMEM; - goto out_wait; + goto out; } else if (ready > 0) { ssize_t read; @@ -1750,7 +1746,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) - goto out_wait; + goto out; if (read >= target || flags & MSG_PEEK) break; @@ -1773,7 +1769,9 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, break; release_sock(sk); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(sk), &wait); lock_sock(sk); if (signal_pending(current)) { @@ -1783,9 +1781,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = -EAGAIN; break; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } } @@ -1816,8 +1811,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, err = copied; } -out_wait: - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; -- cgit v1.2.3 From c18bdd018e8912ca73ad6c12120b7283b5038875 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 31 Jan 2016 13:27:59 +0100 Subject: batman-adv: Only put gw_node list reference when removed The batadv_gw_node reference counter in batadv_gw_node_update can only be reduced when the list entry was actually removed. Otherwise the reference counter may reach zero when batadv_gw_node_update is called from two different contexts for the same gw_node but only one context is actually removing the entry from the list. The release function for this gw_node is not called inside the list_lock spinlock protected region because the function batadv_gw_node_update still holds a gw_node reference for the object pointer on the stack. Thus the actual release function (when required) will be called only at the end of the function. Fixes: bd3524c14bd0 ("batman-adv: remove obsolete deleted attribute for gateway node") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index e6c8382c79ba..ccf70bed0d0c 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -527,11 +527,12 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, * gets dereferenced. */ spin_lock_bh(&bat_priv->gw.list_lock); - hlist_del_init_rcu(&gw_node->list); + if (!hlist_unhashed(&gw_node->list)) { + hlist_del_init_rcu(&gw_node->list); + batadv_gw_node_free_ref(gw_node); + } spin_unlock_bh(&bat_priv->gw.list_lock); - batadv_gw_node_free_ref(gw_node); - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); -- cgit v1.2.3 From 3db152093efb750bc47fd4d69355b90b18113105 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 31 Jan 2016 13:28:00 +0100 Subject: batman-adv: Only put orig_node_vlan list reference when removed The batadv_orig_node_vlan reference counter in batadv_tt_global_size_mod can only be reduced when the list entry was actually removed. Otherwise the reference counter may reach zero when batadv_tt_global_size_mod is called from two different contexts for the same orig_node_vlan but only one context is actually removing the entry from the list. The release function for this orig_node_vlan is not called inside the vlan_list_lock spinlock protected region because the function batadv_tt_global_size_mod still holds a orig_node_vlan reference for the object pointer on the stack. Thus the actual release function (when required) will be called only at the end of the function. Fixes: 7ea7b4a14275 ("batman-adv: make the TT CRC logic VLAN specific") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index cdfc85fa2743..0e80fd1461ab 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -303,9 +303,11 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - hlist_del_init_rcu(&vlan->list); + if (!hlist_unhashed(&vlan->list)) { + hlist_del_init_rcu(&vlan->list); + batadv_orig_node_vlan_free_ref(vlan); + } spin_unlock_bh(&orig_node->vlan_list_lock); - batadv_orig_node_vlan_free_ref(vlan); } batadv_orig_node_vlan_free_ref(vlan); -- cgit v1.2.3 From 1bc4e2b000e7fa9773d6623bc8850561ce10a4fb Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 11 Feb 2016 22:15:57 +0100 Subject: batman-adv: Avoid endless loop in bat-on-bat netdevice check batman-adv checks in different situation if a new device is already on top of a different batman-adv device. This is done by getting the iflink of a device and all its parent. It assumes that this iflink is always a parent device in an acyclic graph. But this assumption is broken by devices like veth which are actually a pair of two devices linked to each other. The recursive check would therefore get veth0 when calling dev_get_iflink on veth1. And it gets veth0 when calling dev_get_iflink with veth1. Creating a veth pair and loading batman-adv freezes parts of the system ip link add veth0 type veth peer name veth1 modprobe batman-adv An RCU stall will be detected on the system which cannot be fixed. INFO: rcu_sched self-detected stall on CPU 1: (5264 ticks this GP) idle=3e9/140000000000001/0 softirq=144683/144686 fqs=5249 (t=5250 jiffies g=46 c=45 q=43) Task dump for CPU 1: insmod R running task 0 247 245 0x00000008 ffffffff8151f140 ffffffff8107888e ffff88000fd141c0 ffffffff8151f140 0000000000000000 ffffffff81552df0 ffffffff8107b420 0000000000000001 ffff88000e3fa700 ffffffff81540b00 ffffffff8107d667 0000000000000001 Call Trace: [] ? rcu_dump_cpu_stacks+0x7e/0xd0 [] ? rcu_check_callbacks+0x3f0/0x6b0 [] ? hrtimer_run_queues+0x47/0x180 [] ? update_process_times+0x2d/0x50 [] ? tick_handle_periodic+0x1b/0x60 [] ? smp_trace_apic_timer_interrupt+0x5e/0x90 [] ? apic_timer_interrupt+0x82/0x90 [] ? __dev_get_by_index+0x37/0x40 [] ? batadv_hard_if_event+0xee/0x3a0 [batman_adv] [] ? register_netdevice_notifier+0x81/0x1a0 [...] This can be avoided by checking if two devices are each others parent and stopping the check in this situation. Fixes: b7eddd0b3950 ("batman-adv: prevent using any virtual device created on batman-adv as hard-interface") Signed-off-by: Andrew Lunn [sven@narfation.org: rewritten description, extracted fix] Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/hard-interface.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 01acccc4d218..57f7107169f5 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -75,6 +75,28 @@ out: return hard_iface; } +/** + * batadv_mutual_parents - check if two devices are each others parent + * @dev1: 1st net_device + * @dev2: 2nd net_device + * + * veth devices come in pairs and each is the parent of the other! + * + * Return: true if the devices are each others parent, otherwise false + */ +static bool batadv_mutual_parents(const struct net_device *dev1, + const struct net_device *dev2) +{ + int dev1_parent_iflink = dev_get_iflink(dev1); + int dev2_parent_iflink = dev_get_iflink(dev2); + + if (!dev1_parent_iflink || !dev2_parent_iflink) + return false; + + return (dev1_parent_iflink == dev2->ifindex) && + (dev2_parent_iflink == dev1->ifindex); +} + /** * batadv_is_on_batman_iface - check if a device is a batman iface descendant * @net_dev: the device to check @@ -108,6 +130,9 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) if (WARN(!parent_dev, "Cannot find parent device")) return false; + if (batadv_mutual_parents(net_dev, parent_dev)) + return false; + ret = batadv_is_on_batman_iface(parent_dev); return ret; -- cgit v1.2.3 From 1b92ee3d03af6643df395300ba7748f19ecdb0c5 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Mon, 8 Feb 2016 18:47:19 +0000 Subject: af_unix: Don't set err in unix_stream_read_generic unless there was an error The present unix_stream_read_generic contains various code sequences of the form err = -EDISASTER; if () goto out; This has the unfortunate side effect of possibly causing the error code to bleed through to the final out: return copied ? : err; and then to be wrongly returned if no data was copied because the caller didn't supply a data buffer, as demonstrated by the program available at http://pad.lv/1540731 Change it such that err is only set if an error condition was detected. Fixes: 3822b5c2fc62 ("af_unix: Revert 'lock_interruptible' in stream receive code") Reported-by: Joseph Salisbury Signed-off-by: Rainer Weikusat Signed-off-by: David S. Miller --- net/unix/af_unix.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 29be035f9c65..df923caa8389 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2277,13 +2277,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) size_t size = state->size; unsigned int last_len; - err = -EINVAL; - if (sk->sk_state != TCP_ESTABLISHED) + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) { + err = -EINVAL; goto out; + } - err = -EOPNOTSUPP; - if (flags & MSG_OOB) + if (unlikely(flags & MSG_OOB)) { + err = -EOPNOTSUPP; goto out; + } target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); @@ -2329,9 +2331,11 @@ again: goto unlock; unix_state_unlock(sk); - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; break; + } + mutex_unlock(&u->readlock); timeo = unix_stream_data_wait(sk, timeo, last, -- cgit v1.2.3 From a5527dda344fff0514b7989ef7a755729769daa1 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 11 Feb 2016 19:37:27 +0000 Subject: af_unix: Guard against other == sk in unix_dgram_sendmsg The unix_dgram_sendmsg routine use the following test if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { to determine if sk and other are in an n:1 association (either established via connect or by using sendto to send messages to an unrelated socket identified by address). This isn't correct as the specified address could have been bound to the sending socket itself or because this socket could have been connected to itself by the time of the unix_peer_get but disconnected before the unix_state_lock(other). In both cases, the if-block would be entered despite other == sk which might either block the sender unintentionally or lead to trying to unlock the same spin lock twice for a non-blocking send. Add a other != sk check to guard against this. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Reported-By: Philipp Hahn Signed-off-by: Rainer Weikusat Tested-by: Philipp Hahn Signed-off-by: David S. Miller --- net/unix/af_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index df923caa8389..c51e2831f498 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1781,7 +1781,12 @@ restart_locked: goto out_unlock; } - if (unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { + /* other == sk && unix_peer(other) != sk if + * - unix_peer(sk) == NULL, destination address bound to sk + * - unix_peer(sk) == sk by time of get but disconnected before lock + */ + if (other != sk && + unlikely(unix_peer(other) != sk && unix_recvq_full(other))) { if (timeo) { timeo = unix_wait_for_peer(other, timeo); -- cgit v1.2.3 From 78565208d73ca9b654fb9a6b142214d52eeedfd1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 9 Feb 2016 06:14:43 -0800 Subject: net: Copy inner L3 and L4 headers as unaligned on GRE TEB This patch corrects the unaligned accesses seen on GRE TEB tunnels when generating hash keys. Specifically what this patch does is make it so that we force the use of skb_copy_bits when the GRE inner headers will be unaligned due to NET_IP_ALIGNED being a non-zero value. Signed-off-by: Alexander Duyck Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index eab81bc80e5c..12e700332010 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -399,6 +399,13 @@ ip_proto_again: goto out_bad; proto = eth->h_proto; nhoff += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = nhoff; } key_control->flags |= FLOW_DIS_ENCAPSULATION; -- cgit v1.2.3 From 56bb7fd994f4cc163de08006bf68d959027a9f36 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Feb 2016 16:09:02 +0100 Subject: bridge: mdb: avoid uninitialized variable warning A recent change to the mdb code confused the compiler to the point where it did not realize that the port-group returned from br_mdb_add_group() is always valid when the function returns a nonzero return value, so we get a spurious warning: net/bridge/br_mdb.c: In function 'br_mdb_add': net/bridge/br_mdb.c:542:4: error: 'pg' may be used uninitialized in this function [-Werror=maybe-uninitialized] __br_mdb_notify(dev, entry, RTM_NEWMDB, pg); Slightly rearranging the code in br_mdb_add_group() makes the problem go away, as gcc is clever enough to see that both functions check for 'ret != 0'. Signed-off-by: Arnd Bergmann Fixes: 9e8430f8d60d ("bridge: mdb: Passing the port-group pointer to br_mdb module") Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 30e105f57f0d..74c278e00225 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -425,8 +425,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, mp = br_mdb_ip_get(mdb, group); if (!mp) { mp = br_multicast_new_group(br, port, group); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + err = PTR_ERR_OR_ZERO(mp); + if (err) return err; } -- cgit v1.2.3 From d5c91fb72f1652ea3026925240a0998a42ddb16b Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 10 Feb 2016 16:14:57 -0500 Subject: tipc: fix premature addition of node to lookup table In commit 5266698661401a ("tipc: let broadcast packet reception use new link receive function") we introduced a new per-node broadcast reception link instance. This link is created at the moment the node itself is created. Unfortunately, the allocation is done after the node instance has already been added to the node lookup hash table. This creates a potential race condition, where arriving broadcast packets are able to find and access the node before it has been fully initialized, and before the above mentioned link has been created. The result is occasional crashes in the function tipc_bcast_rcv(), which is trying to access the not-yet existing link. We fix this by deferring the addition of the node instance until after it has been fully initialized in the function tipc_node_create(). Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index fa97d9649a28..9d7a16fc5ca4 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -346,12 +346,6 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); - hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); - list_for_each_entry_rcu(temp_node, &tn->node_list, list) { - if (n->addr < temp_node->addr) - break; - } - list_add_tail_rcu(&n->list, &temp_node->list); n->state = SELF_DOWN_PEER_LEAVING; n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; @@ -372,6 +366,12 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) tipc_node_get(n); setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n); n->keepalive_intv = U32_MAX; + hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); + list_for_each_entry_rcu(temp_node, &tn->node_list, list) { + if (n->addr < temp_node->addr) + break; + } + list_add_tail_rcu(&n->list, &temp_node->list); exit: spin_unlock_bh(&tn->node_list_lock); return n; -- cgit v1.2.3 From a407054f830ca9a28febdeeeaa34d2ed420b9ed3 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Thu, 11 Feb 2016 11:44:49 +0100 Subject: net: dsa: remove phy_disconnect from error path The phy has not been initialized, disconnecting it in the error path results in a NULL pointer exception. Drop the phy_disconnect from the error path. Signed-off-by: Sascha Hauer Reviewed-by: Andrew Lunn Acked-by: Neil Armstrong Signed-off-by: David S. Miller --- net/dsa/slave.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 40b9ca72aae3..91e3b2ff364a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1194,7 +1194,6 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, if (ret) { netdev_err(master, "error %d registering interface %s\n", ret, slave_dev->name); - phy_disconnect(p->phy); ds->ports[port] = NULL; free_netdev(slave_dev); return ret; -- cgit v1.2.3 From 372022830b06d9980c7e8b41fa0a4081cff883b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Feb 2016 08:58:18 -0800 Subject: tcp: do not set rtt_min to 1 There are some cases where rtt_us derives from deltas of jiffies, instead of using usec timestamps. Since we want to track minimal rtt, better to assume a delta of 0 jiffie might be in fact be very close to 1 jiffie. It is kind of sad jiffies_to_usecs(1) calls a function instead of simply using a constant. Fixes: f672258391b42 ("tcp: track min RTT using windowed min-filter") Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1c2a73406261..3b2c8e90a475 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2896,7 +2896,10 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; + struct rtt_meas rttm = { + .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), + .ts = now, + }; u32 elapsed; /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ -- cgit v1.2.3 From 729235554d805c63e5e274fcc6a98e71015dd847 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Feb 2016 22:50:29 -0800 Subject: tcp: md5: release request socket instead of listener If tcp_v4_inbound_md5_hash() returns an error, we must release the refcount on the request socket, not on the listener. The bug was added for IPv4 only. Fixes: 079096f103fac ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7f6ff037adaf..c84477949d3a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1600,8 +1600,10 @@ process: struct sock *nsk = NULL; sk = req->rsk_listener; - if (tcp_v4_inbound_md5_hash(sk, skb)) - goto discard_and_relse; + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + reqsk_put(req); + goto discard_it; + } if (likely(sk->sk_state == TCP_LISTEN)) { nsk = tcp_check_req(sk, skb, req, false); } else { -- cgit v1.2.3 From 853effc55b0f975abd6d318cca486a9c1b67e10f Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Mon, 15 Feb 2016 16:24:44 +1300 Subject: l2tp: Fix error creating L2TP tunnels A previous commit (33f72e6) added notification via netlink for tunnels when created/modified/deleted. If the notification returned an error, this error was returned from the tunnel function. If there were no listeners, the error code ESRCH was returned, even though having no listeners is not an error. Other calls to this and other similar notification functions either ignore the error code, or filter ESRCH. This patch checks for ESRCH and does not flag this as an error. Reviewed-by: Hamish Martin Signed-off-by: Mark Tomlinson Signed-off-by: David S. Miller --- net/l2tp/l2tp_netlink.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f93c5be612a7..2caaa84ce92d 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family, ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); @@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family, ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); - if (ret >= 0) - return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + if (ret >= 0) { + ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + return ret; + } nlmsg_free(msg); -- cgit v1.2.3 From 73dcb556538a4192222c3a919a51e1701bae553b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 17 Feb 2016 18:43:22 -0800 Subject: net: dsa: Unregister slave_dev in error path With commit 0071f56e46da ("dsa: Register netdev before phy"), we are now trying to free a network device that has been previously registered, and in case of errors, this will make us hit the BUG_ON(dev->reg_state != NETREG_UNREGISTERED) condition. Fix this by adding a missing unregister_netdev() before free_netdev(). Fixes: 0071f56e46da ("dsa: Register netdev before phy") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/slave.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 91e3b2ff364a..ab24521beb4d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1204,6 +1204,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); + unregister_netdev(slave_dev); free_netdev(slave_dev); return ret; } -- cgit v1.2.3 From 1eea84b74cd28773c37bf1bda69915f7b9a67efc Mon Sep 17 00:00:00 2001 From: Insu Yun Date: Mon, 15 Feb 2016 21:30:33 -0500 Subject: tcp: correctly crypto_alloc_hash return check crypto_alloc_hash never returns NULL Signed-off-by: Insu Yun Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c36ef4a3f86..483ffdf5aa4d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2950,7 +2950,7 @@ static void __tcp_alloc_md5sig_pool(void) struct crypto_hash *hash; hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR_OR_NULL(hash)) + if (IS_ERR(hash)) return; per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; } -- cgit v1.2.3 From 619fe32640b4b01f370574d50344ae0f62689816 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 18 Feb 2016 07:38:04 -0500 Subject: net_sched fix: reclassification needs to consider ether protocol changes actions could change the etherproto in particular with ethernet tunnelled data. Typically such actions, after peeling the outer header, will ask for the packet to be reclassified. We then need to restart the classification with the new proto header. Example setup used to catch this: sudo tc qdisc add dev $ETH ingress sudo $TC filter add dev $ETH parent ffff: pref 1 protocol 802.1Q \ u32 match u32 0 0 flowid 1:1 \ action vlan pop reclassify Fixes: 3b3ae880266d ("net: sched: consolidate tc_classify{,_compat}") Signed-off-by: Jamal Hadi Salim Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/sch_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b5c2cf2aa6d4..af1acf009866 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1852,6 +1852,7 @@ reset: } tp = old_tp; + protocol = tc_skb_protocol(skb); goto reclassify; #endif } -- cgit v1.2.3 From deed49df7390d5239024199e249190328f1651e7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 18 Feb 2016 21:21:19 +0800 Subject: route: check and remove route cache when we get route Since the gc of ipv4 route was removed, the route cached would has no chance to be removed, and even it has been timeout, it still could be used, cause no code to check it's expires. Fix this issue by checking and removing route cache when we get route. Signed-off-by: Xin Long Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/route.c | 77 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 85f184e429c6..02c62299d717 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; +static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ @@ -755,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, new_gw, - 0, 0); + 0, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; @@ -1556,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } +static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) +{ + struct fnhe_hash_bucket *hash; + struct fib_nh_exception *fnhe, __rcu **fnhe_p; + u32 hval = fnhe_hashfun(daddr); + + spin_lock_bh(&fnhe_lock); + + hash = rcu_dereference_protected(nh->nh_exceptions, + lockdep_is_held(&fnhe_lock)); + hash += hval; + + fnhe_p = &hash->chain; + fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); + while (fnhe) { + if (fnhe->fnhe_daddr == daddr) { + rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( + fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); + fnhe_flush_routes(fnhe); + kfree_rcu(fnhe, rcu); + break; + } + fnhe_p = &fnhe->fnhe_next; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, + lockdep_is_held(&fnhe_lock)); + } + + spin_unlock_bh(&fnhe_lock); +} + /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1609,11 +1640,20 @@ static int __mkroute_input(struct sk_buff *skb, fnhe = find_exception(&FIB_RES_NH(*res), daddr); if (do_cache) { - if (fnhe) + if (fnhe) { rth = rcu_dereference(fnhe->fnhe_rth_input); - else - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(&FIB_RES_NH(*res), daddr); + fnhe = NULL; + } else { + goto rt_cache; + } + } + + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); +rt_cache: if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -2014,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct fib_nh *nh = &FIB_RES_NH(*res); fnhe = find_exception(nh, fl4->daddr); - if (fnhe) + if (fnhe) { prth = &fnhe->fnhe_rth_output; - else { - if (unlikely(fl4->flowi4_flags & - FLOWI_FLAG_KNOWN_NH && - !(nh->nh_gw && - nh->nh_scope == RT_SCOPE_LINK))) { - do_cache = false; - goto add; + rth = rcu_dereference(*prth); + if (rth && rth->dst.expires && + time_after(jiffies, rth->dst.expires)) { + ip_del_fnhe(nh, fl4->daddr); + fnhe = NULL; + } else { + goto rt_cache; } - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); } + + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); rth = rcu_dereference(*prth); + +rt_cache: if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; @@ -2569,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev) } #ifdef CONFIG_SYSCTL -static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; -- cgit v1.2.3 From 7716682cc58e305e22207d5bb315f26af6b1e243 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Feb 2016 05:39:18 -0800 Subject: tcp/dccp: fix another race at listener dismantle Ilya reported following lockdep splat: kernel: ========================= kernel: [ BUG: held lock freed! ] kernel: 4.5.0-rc1-ceph-00026-g5e0a311 #1 Not tainted kernel: ------------------------- kernel: swapper/5/0 is freeing memory ffff880035c9d200-ffff880035c9dbff, with a lock still held there! kernel: (&(&queue->rskq_lock)->rlock){+.-...}, at: [] inet_csk_reqsk_queue_add+0x28/0xa0 kernel: 4 locks held by swapper/5/0: kernel: #0: (rcu_read_lock){......}, at: [] netif_receive_skb_internal+0x4b/0x1f0 kernel: #1: (rcu_read_lock){......}, at: [] ip_local_deliver_finish+0x3f/0x380 kernel: #2: (slock-AF_INET){+.-...}, at: [] sk_clone_lock+0x19b/0x440 kernel: #3: (&(&queue->rskq_lock)->rlock){+.-...}, at: [] inet_csk_reqsk_queue_add+0x28/0xa0 To properly fix this issue, inet_csk_reqsk_queue_add() needs to return to its callers if the child as been queued into accept queue. We also need to make sure listener is still there before calling sk->sk_data_ready(), by holding a reference on it, since the reference carried by the child can disappear as soon as the child is put on accept queue. Reported-by: Ilya Dryomov Fixes: ebb516af60e1 ("tcp/dccp: fix race at listener dismantle phase") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 14 +++++++------- net/dccp/ipv6.c | 14 +++++++------- net/ipv4/inet_connection_sock.c | 14 +++++++------- net/ipv4/tcp_ipv4.c | 14 +++++++------- net/ipv6/tcp_ipv6.c | 14 +++++++------- 5 files changed, 35 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 5684e14932bd..902d606324a0 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -824,26 +824,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v4_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9c6d0508e63a..b8608b71a66d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -691,26 +691,26 @@ lookup: if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; - if (likely(sk->sk_state == DCCP_LISTEN)) { - nsk = dccp_check_req(sk, skb, req); - } else { + if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v6_ctl_send_reset(sk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 46b9c887bede..64148914803a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -789,14 +789,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, reqsk_put(req); } -void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, - struct sock *child) +struct sock *inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); + child = NULL; } else { req->sk = child; req->dl_next = NULL; @@ -808,6 +810,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); + return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); @@ -817,11 +820,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, if (own_req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); - inet_csk_reqsk_queue_add(sk, req, child); - /* Warning: caller must not call reqsk_put(req); - * child stole last reference on it. - */ - return child; + if (inet_csk_reqsk_queue_add(sk, req, child)) + return child; } /* Too bad, another child took ownership of the request, undo. */ bh_unlock_sock(child); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c84477949d3a..487ac67059e2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1597,30 +1597,30 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { reqsk_put(req); goto discard_it; } - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1a5a70fb8551..5c8c84273028 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1387,7 +1387,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); - struct sock *nsk = NULL; + struct sock *nsk; sk = req->rsk_listener; tcp_v6_fill_cb(skb, hdr, th); @@ -1395,24 +1395,24 @@ process: reqsk_put(req); goto discard_it; } - if (likely(sk->sk_state == TCP_LISTEN)) { - nsk = tcp_check_req(sk, skb, req, false); - } else { + if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + sock_hold(sk); + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); - goto discard_it; + goto discard_and_relse; } if (nsk == sk) { - sock_hold(sk); reqsk_put(req); tcp_v6_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); - goto discard_it; + goto discard_and_relse; } else { + sock_put(sk); return 0; } } -- cgit v1.2.3 From d13b161c2c7c67401bb222c30302339285ac148e Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Wed, 17 Feb 2016 15:32:53 +0100 Subject: gre: clear IFF_TX_SKB_SHARING ether_setup sets IFF_TX_SKB_SHARING but this is not supported by gre as it modifies the skb on xmit. Also, clean up whitespace in ipgre_tap_setup when we're already touching it. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 5 +++-- net/ipv6/ip6_gre.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 56fdf4e0dce4..41ba68de46d8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1054,8 +1054,9 @@ static const struct net_device_ops gre_tap_netdev_ops = { static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); - dev->netdev_ops = &gre_tap_netdev_ops; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + dev->netdev_ops = &gre_tap_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip_tunnel_setup(dev, gre_tap_net_id); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f37f18b6b40c..a69aad1e29d1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1512,6 +1512,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->destructor = ip6gre_dev_free; dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; } static int ip6gre_newlink(struct net *src_net, struct net_device *dev, -- cgit v1.2.3 From a813104d923339144078939175faf4e66aca19b4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 17 Feb 2016 15:37:43 +0100 Subject: IFF_NO_QUEUE: Fix for drivers not calling ether_setup() My implementation around IFF_NO_QUEUE driver flag assumed that leaving tx_queue_len untouched (specifically: not setting it to zero) by drivers would make it possible to assign a regular qdisc to them without having to worry about setting tx_queue_len to a useful value. This was only partially true: I overlooked that some drivers don't call ether_setup() and therefore not initialize tx_queue_len to the default value of 1000. Consequently, removing the workarounds in place for that case in qdisc implementations which cared about it (namely, pfifo, bfifo, gred, htb, plug and sfb) leads to problems with these specific interface types and qdiscs. Luckily, there's already a sanitization point for drivers setting tx_queue_len to zero, which can be reused to assign the fallback value most qdisc implementations used, which is 1. Fixes: 348e3435cbefa ("net: sched: drop all special handling of tx_queue_len == 0") Tested-by: Mathieu Desnoyers Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8cba3d852f25..e15e6e6a7a8a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7422,8 +7422,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); - if (!dev->tx_queue_len) + if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; + dev->tx_queue_len = 1; + } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; -- cgit v1.2.3 From 48bb230e8723d7dd87928f0c0c3f6cb1fd5bc9be Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 17 Feb 2016 10:53:59 -0500 Subject: appletalk: fix erroneous return value The atalk_sendmsg() function might return wrong value ENETUNREACH instead of -ENETUNREACH. Signed-off-by: Anton Protopopov Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index d5871ac493eb..f066781be3c8 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1625,7 +1625,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) rt = atrtr_find(&at_hint); } - err = ENETUNREACH; + err = -ENETUNREACH; if (!rt) goto out; -- cgit v1.2.3 From 449f14f01f65f45f332e3360aa46b3d3571b2cba Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 17 Feb 2016 10:54:13 -0500 Subject: net: caif: fix erroneous return value The cfrfml_receive() function might return positive value EPROTO Signed-off-by: Anton Protopopov Signed-off-by: David S. Miller --- net/caif/cfrfml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index 61d7617d9249..b82440e1fcb4 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -159,7 +159,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt) tmppkt = NULL; /* Verify that length is correct */ - err = EPROTO; + err = -EPROTO; if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1) goto out; } -- cgit v1.2.3 From cfdd28beb3205dbd1e91571516807199c8ab84ca Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 17 Feb 2016 18:00:31 +0100 Subject: net: make netdev_for_each_lower_dev safe for device removal When I used netdev_for_each_lower_dev in commit bad531623253 ("vrf: remove slave queue and private slave struct") I thought that it acts like netdev_for_each_lower_private and can be used to remove the current device from the list while walking, but unfortunately it acts more like netdev_for_each_lower_private_rcu and doesn't allow it. The difference is where the "iter" points to, right now it points to the current element and that makes it impossible to remove it. Change the logic to be similar to netdev_for_each_lower_private and make it point to the "next" element so we can safely delete the current one. VRF is the only such user right now, there's no change for the read-only users. Here's what can happen now: [98423.249858] general protection fault: 0000 [#1] SMP [98423.250175] Modules linked in: vrf bridge(O) stp llc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace sunrpc crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel jitterentropy_rng sha256_generic hmac drbg ppdev aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd evdev serio_raw pcspkr virtio_balloon parport_pc parport i2c_piix4 i2c_core virtio_console acpi_cpufreq button 9pnet_virtio 9p 9pnet fscache ipv6 autofs4 ext4 crc16 mbcache jbd2 sg virtio_blk virtio_net sr_mod cdrom e1000 ata_generic ehci_pci uhci_hcd ehci_hcd usbcore usb_common virtio_pci ata_piix libata floppy virtio_ring virtio scsi_mod [last unloaded: bridge] [98423.255040] CPU: 1 PID: 14173 Comm: ip Tainted: G O 4.5.0-rc2+ #81 [98423.255386] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 [98423.255777] task: ffff8800547f5540 ti: ffff88003428c000 task.ti: ffff88003428c000 [98423.256123] RIP: 0010:[] [] netdev_lower_get_next+0x1e/0x30 [98423.256534] RSP: 0018:ffff88003428f940 EFLAGS: 00010207 [98423.256766] RAX: 0002000100000004 RBX: ffff880054ff9000 RCX: 0000000000000000 [98423.257039] RDX: ffff88003428f8b8 RSI: ffff88003428f950 RDI: ffff880054ff90c0 [98423.257287] RBP: ffff88003428f940 R08: 0000000000000000 R09: 0000000000000000 [98423.257537] R10: 0000000000000001 R11: 0000000000000000 R12: ffff88003428f9e0 [98423.257802] R13: ffff880054a5fd00 R14: ffff88003428f970 R15: 0000000000000001 [98423.258055] FS: 00007f3d76881700(0000) GS:ffff88005d000000(0000) knlGS:0000000000000000 [98423.258418] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [98423.258650] CR2: 00007ffe5951ffa8 CR3: 0000000052077000 CR4: 00000000000406e0 [98423.258902] Stack: [98423.259075] ffff88003428f960 ffffffffa0442636 0002000100000004 ffff880054ff9000 [98423.259647] ffff88003428f9b0 ffffffff81518205 ffff880054ff9000 ffff88003428f978 [98423.260208] ffff88003428f978 ffff88003428f9e0 ffff88003428f9e0 ffff880035b35f00 [98423.260739] Call Trace: [98423.260920] [] vrf_dev_uninit+0x76/0xa0 [vrf] [98423.261156] [] rollback_registered_many+0x205/0x390 [98423.261401] [] unregister_netdevice_many+0x1c/0x70 [98423.261641] [] rtnl_delete_link+0x3c/0x50 [98423.271557] [] rtnl_dellink+0xcb/0x1d0 [98423.271800] [] ? __inc_zone_state+0x4a/0x90 [98423.272049] [] rtnetlink_rcv_msg+0x84/0x200 [98423.272279] [] ? trace_hardirqs_on+0xd/0x10 [98423.272513] [] ? rtnetlink_rcv+0x1b/0x40 [98423.272755] [] ? rtnetlink_rcv+0x40/0x40 [98423.272983] [] netlink_rcv_skb+0x97/0xb0 [98423.273209] [] rtnetlink_rcv+0x2a/0x40 [98423.273476] [] netlink_unicast+0x11b/0x1a0 [98423.273710] [] netlink_sendmsg+0x3e1/0x610 [98423.273947] [] sock_sendmsg+0x38/0x70 [98423.274175] [] ___sys_sendmsg+0x2e3/0x2f0 [98423.274416] [] ? do_raw_spin_unlock+0xbe/0x140 [98423.274658] [] ? handle_mm_fault+0x26c/0x2210 [98423.274894] [] ? handle_mm_fault+0x4d/0x2210 [98423.275130] [] ? __fget_light+0x91/0xb0 [98423.275365] [] __sys_sendmsg+0x42/0x80 [98423.275595] [] SyS_sendmsg+0x12/0x20 [98423.275827] [] entry_SYSCALL_64_fastpath+0x16/0x7a [98423.276073] Code: c3 31 c0 5d c3 0f 1f 84 00 00 00 00 00 66 66 66 66 90 48 8b 06 55 48 81 c7 c0 00 00 00 48 89 e5 48 8b 00 48 39 f8 74 09 48 89 06 <48> 8b 40 e8 5d c3 31 c0 5d c3 0f 1f 84 00 00 00 00 00 66 66 66 [98423.279639] RIP [] netdev_lower_get_next+0x1e/0x30 [98423.279920] RSP CC: David Ahern CC: David S. Miller CC: Roopa Prabhu CC: Vlad Yasevich Fixes: bad531623253 ("vrf: remove slave queue and private slave struct") Signed-off-by: Nikolay Aleksandrov Reviewed-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e15e6e6a7a8a..0ef061b2badc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5379,12 +5379,12 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; - lower = list_entry((*iter)->next, struct netdev_adjacent, list); + lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; - *iter = &lower->list; + *iter = lower->list.next; return lower->dev; } -- cgit v1.2.3 From a97eb33ff225f34a8124774b3373fd244f0e83ce Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Tue, 16 Feb 2016 21:43:16 -0500 Subject: rtnl: RTM_GETNETCONF: fix wrong return value An error response from a RTM_GETNETCONF request can return the positive error value EINVAL in the struct nlmsgerr that can mislead userspace. Signed-off-by: Anton Protopopov Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- net/ipv6/addrconf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cebd9d31e65a..f6303b17546b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1847,7 +1847,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9efd9ffdc34c..bdd7eac4307a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -583,7 +583,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = EINVAL; + err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) goto errout; -- cgit v1.2.3 From b53ce3e7d407aa4196877a48b8601181162ab158 Mon Sep 17 00:00:00 2001 From: Insu Yun Date: Wed, 17 Feb 2016 11:47:35 -0500 Subject: tipc: unlock in error path tipc_bcast_unlock need to be unlocked in error path. Signed-off-by: Insu Yun Signed-off-by: David S. Miller --- net/tipc/link.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 0c2944fb9ae0..347cdc99ed09 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1973,8 +1973,10 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_LINK_GET); - if (!hdr) + if (!hdr) { + tipc_bcast_unlock(net); return -EMSGSIZE; + } attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); if (!attrs) -- cgit v1.2.3 From c868ee7063bdb53f3ef9eac7bcec84960980b471 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 17 Feb 2016 19:30:01 +0100 Subject: lwt: fix rx checksum setting for lwt devices tunneling over ipv6 the commit 35e2d1152b22 ("tunnels: Allow IPv6 UDP checksums to be correctly controlled.") changed the default xmit checksum setting for lwt vxlan/geneve ipv6 tunnels, so that now the checksum is not set into external UDP header. This commit changes the rx checksum setting for both lwt vxlan/geneve devices created by openvswitch accordingly, so that lwt over ipv6 tunnel pairs are again able to communicate with default values. Signed-off-by: Paolo Abeni Acked-by: Jiri Benc Acked-by: Jesse Gross Signed-off-by: David S. Miller --- net/openvswitch/vport-vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index de9cb19efb6a..5eb7694348b5 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -90,7 +90,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) int err; struct vxlan_config conf = { .no_share = true, - .flags = VXLAN_F_COLLECT_METADATA, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, /* Don't restrict the packets that can be sent by MTU */ .mtu = IP_MAX_MTU, }; -- cgit v1.2.3 From b5f0549231ffb025337be5a625b0ff9f52b016f0 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Fri, 19 Feb 2016 04:27:48 +0300 Subject: unix_diag: fix incorrect sign extension in unix_lookup_by_ino The value passed by unix_diag_get_exact to unix_lookup_by_ino has type __u32, but unix_lookup_by_ino's argument ino has type int, which is not a problem yet. However, when ino is compared with sock_i_ino return value of type unsigned long, ino is sign extended to signed long, and this results to incorrect comparison on 64-bit architectures for inode numbers greater than INT_MAX. This bug was found by strace test suite. Fixes: 5d3cae8bc39d ("unix_diag: Dumping exact socket core") Signed-off-by: Dmitry V. Levin Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/unix/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/diag.c b/net/unix/diag.c index c512f64d5287..4d9679701a6d 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -220,7 +220,7 @@ done: return skb->len; } -static struct sock *unix_lookup_by_ino(int ino) +static struct sock *unix_lookup_by_ino(unsigned int ino) { int i; struct sock *sk; -- cgit v1.2.3 From 18eceb818dc37bbc783ec7ef7703f270cc6cd281 Mon Sep 17 00:00:00 2001 From: Rainer Weikusat Date: Thu, 18 Feb 2016 12:39:46 +0000 Subject: af_unix: Don't use continue to re-execute unix_stream_read_generic loop The unix_stream_read_generic function tries to use a continue statement to restart the receive loop after waiting for a message. This may not work as intended as the caller might use a recvmsg call to peek at control messages without specifying a message buffer. If this was the case, the continue will cause the function to return without an error and without the credential information if the function had to wait for a message while it had returned with the credentials otherwise. Change to using goto to restart the loop without checking the condition first in this case so that credentials are returned either way. Signed-off-by: Rainer Weikusat Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/unix/af_unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c51e2831f498..f75f847e688d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2312,6 +2312,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) bool drop_skb; struct sk_buff *skb, *last; +redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -2353,7 +2354,7 @@ again: } mutex_lock(&u->readlock); - continue; + goto redo; unlock: unix_state_unlock(sk); break; -- cgit v1.2.3 From 3bd7594e69bd97c962faa6a5ae15dd8c6c082636 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 19 Feb 2016 14:25:21 -0800 Subject: Bluetooth: hci_core: Avoid mixing up req_complete and req_complete_skb In commit 44d271377479 ("Bluetooth: Compress the size of struct hci_ctrl") we squashed down the size of the structure by using a union with the assumption that all users would use the flag to determine whether we had a req_complete or a req_complete_skb. Unfortunately we had a case in hci_req_cmd_complete() where we weren't looking at the flag. This can result in a situation where we might be storing a hci_req_complete_skb_t in a hci_req_complete_t variable, or vice versa. During some testing I found at least one case where the function hci_req_sync_complete() was called improperly because the kernel thought that it didn't require an SKB. Looking through the stack in kgdb I found that it was called by hci_event_packet() and that hci_event_packet() had both of its locals "req_complete" and "req_complete_skb" pointing to the same place: both to hci_req_sync_complete(). Let's make sure we always check the flag. For more details on debugging done, see . Fixes: 44d271377479 ("Bluetooth: Compress the size of struct hci_ctrl") Signed-off-by: Douglas Anderson Acked-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 47bcef754796..883c821a9e78 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -4112,8 +4112,10 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, break; } - *req_complete = bt_cb(skb)->hci.req_complete; - *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) + *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; + else + *req_complete = bt_cb(skb)->hci.req_complete; kfree_skb(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); -- cgit v1.2.3 From d9749fb5942f51555dc9ce1ac0dbb1806960a975 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 18 Feb 2016 16:10:57 -0500 Subject: sctp: Fix port hash table size computation Dmitry Vyukov noted recently that the sctp_port_hashtable had an error in its size computation, observing that the current method never guaranteed that the hashsize (measured in number of entries) would be a power of two, which the input hash function for that table requires. The root cause of the problem is that two values need to be computed (one, the allocation order of the storage requries, as passed to __get_free_pages, and two the number of entries for the hash table). Both need to be ^2, but for different reasons, and the existing code is simply computing one order value, and using it as the basis for both, which is wrong (i.e. it assumes that ((1< Reported-by: Dmitry Vyukov CC: Dmitry Vyukov CC: Vladislav Yasevich CC: "David S. Miller" Signed-off-by: David S. Miller --- net/sctp/protocol.c | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ab0d538a74ed..1099e99a53c4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -60,6 +60,8 @@ #include #include +#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) + /* Global data structures. */ struct sctp_globals sctp_globals __read_mostly; @@ -1355,6 +1357,8 @@ static __init int sctp_init(void) unsigned long limit; int max_share; int order; + int num_entries; + int max_entry_order; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); @@ -1407,14 +1411,24 @@ static __init int sctp_init(void) /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. + * Though not identical. Start by getting a goal size */ if (totalram_pages >= (128 * 1024)) goal = totalram_pages >> (22 - PAGE_SHIFT); else goal = totalram_pages >> (24 - PAGE_SHIFT); - for (order = 0; (1UL << order) < goal; order++) - ; + /* Then compute the page order for said goal */ + order = get_order(goal); + + /* Now compute the required page order for the maximum sized table we + * want to create + */ + max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * + sizeof(struct sctp_bind_hashbucket)); + + /* Limit the page order by that maximum hash table size */ + order = min(order, max_entry_order); /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; @@ -1430,20 +1444,35 @@ static __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } - /* Allocate and initialize the SCTP port hash table. */ + /* Allocate and initialize the SCTP port hash table. + * Note that order is initalized to start at the max sized + * table we want to support. If we can't get that many pages + * reduce the order and try again + */ do { - sctp_port_hashsize = (1UL << order) * PAGE_SIZE / - sizeof(struct sctp_bind_hashbucket); - if ((sctp_port_hashsize > (64 * 1024)) && order > 0) - continue; sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); + if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } + + /* Now compute the number of entries that will fit in the + * port hash space we allocated + */ + num_entries = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_bind_hashbucket); + + /* And finish by rounding it down to the nearest power of two + * this wastes some memory of course, but its needed because + * the hash function operates based on the assumption that + * that the number of entries is a power of two + */ + sctp_port_hashsize = rounddown_pow_of_two(num_entries); + for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); @@ -1452,7 +1481,8 @@ static __init int sctp_init(void) if (sctp_transport_hashtable_init()) goto err_thash_alloc; - pr_info("Hash tables configured (bind %d)\n", sctp_port_hashsize); + pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, + num_entries); sctp_sysctl_register(); -- cgit v1.2.3