From a399a8053164ec8bcb06fed52be9941a26ecde11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Aug 2012 21:13:53 +0000 Subject: time: jiffies_delta_to_clock_t() helper to the rescue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various /proc/net files sometimes report crazy timer values, expressed in clock_t units. This happens when an expired timer delta (expires - jiffies) is passed to jiffies_to_clock_t(). This function has an overflow in : return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); commit cbbc719fccdb8cb (time: Change jiffies_to_clock_t() argument type to unsigned long) only got around the problem. As we cant output negative values in /proc/net/tcp without breaking various tools, I suggest adding a jiffies_delta_to_clock_t() wrapper that caps the negative delta to a 0 value. Signed-off-by: Eric Dumazet Reported-by: Maciej Żenczykowski Cc: Thomas Gleixner Cc: Paul Gortmaker Cc: Andrew Morton Cc: hank Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2c5a0a06c4ce..db037c9a4c48 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -618,7 +618,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { - .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), + .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse), .rta_used = dst->__use, .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, -- cgit v1.2.3 From 9c7dafbfab1554705f85523fead578aa1a3d338c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 8 Aug 2012 21:52:46 +0000 Subject: net: Allow to create links with given ifindex Currently the RTM_NEWLINK results in -EOPNOTSUPP if the ifinfomsg->ifi_index is not zero. I propose to allow requesting ifindices on link creation. This is required by the checkpoint-restore to correctly restore a net namespace (i.e. -- a container). Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++++- net/core/rtnetlink.c | 12 +++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f91abf800161..3ca300d85271 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5579,7 +5579,12 @@ int register_netdevice(struct net_device *dev) } } - dev->ifindex = dev_new_index(net); + ret = -EBUSY; + if (!dev->ifindex) + dev->ifindex = dev_new_index(net); + else if (__dev_get_by_index(net, dev->ifindex)) + goto err_uninit; + if (dev->iflink == -1) dev->iflink = dev->ifindex; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index db037c9a4c48..34d975b0f277 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1812,8 +1812,6 @@ replay: return -ENODEV; } - if (ifm->ifi_index) - return -EOPNOTSUPP; if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; @@ -1839,10 +1837,14 @@ replay: return PTR_ERR(dest_net); dev = rtnl_create_link(net, dest_net, ifname, ops, tb); - - if (IS_ERR(dev)) + if (IS_ERR(dev)) { err = PTR_ERR(dev); - else if (ops->newlink) + goto out; + } + + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) err = ops->newlink(net, dev, tb, data); else err = register_netdevice(dev); -- cgit v1.2.3 From aa79e66eee5d525e2fcbd2a5fcb87ae3dd4aa9e9 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 8 Aug 2012 21:53:19 +0000 Subject: net: Make ifindex generation per-net namespace Strictly speaking this is only _really_ required for checkpoint-restore to make loopback device always have the same index. This change appears to be safe wrt "ifindex should be unique per-system" concept, as all the ifindex usage is either already made per net namespace of is explicitly limited with init_net only. There are two cool side effects of this. The first one -- ifindices of devices in container are always small, regardless of how many containers we've started (and re-started) so far. The second one is -- we can speed up the loopback ifidex access as shown in the next patch. v2: Place ifindex right after dev_base_seq : avoid two holes and use the same cache line, dirtied in list_netdevice()/unlist_netdevice() Signed-off-by: Pavel Emelyanov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3ca300d85271..1f06df8d10a3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5221,12 +5221,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) */ static int dev_new_index(struct net *net) { - static int ifindex; + int ifindex = net->ifindex; for (;;) { if (++ifindex <= 0) ifindex = 1; if (!__dev_get_by_index(net, ifindex)) - return ifindex; + return net->ifindex = ifindex; } } -- cgit v1.2.3 From ee89bab14e857678f83a71ee99e575b0fdbb58d4 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 9 Aug 2012 22:14:56 +0000 Subject: net: move and rename netif_notify_peers() I believe net/core/dev.c is a better place for netif_notify_peers(), because other net event notify functions also stay in this file. And rename it to netdev_notify_peers(). Cc: David S. Miller Cc: Ian Campbell Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1f06df8d10a3..5defcf940005 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1106,6 +1106,24 @@ void netdev_state_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_state_change); +/** + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void netdev_notify_peers(struct net_device *dev) +{ + rtnl_lock(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(netdev_notify_peers); + int netdev_bonding_change(struct net_device *dev, unsigned long event) { return call_netdevice_notifiers(event, dev); -- cgit v1.2.3 From b7bc2a5b5bd99b216c3e5fe68c7f45c684ab5745 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 9 Aug 2012 22:14:57 +0000 Subject: net: remove netdev_bonding_change() I don't see any benifits to use netdev_bonding_change() than using call_netdevice_notifiers() directly. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5defcf940005..ce1bccb08de5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1124,12 +1124,6 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); -int netdev_bonding_change(struct net_device *dev, unsigned long event) -{ - return call_netdevice_notifiers(event, dev); -} -EXPORT_SYMBOL(netdev_bonding_change); - /** * dev_load - load a network module * @net: the applicable net namespace -- cgit v1.2.3 From 0115e8e30d6fcdd4b8faa30d3ffd90859a591f51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Aug 2012 17:19:46 +0000 Subject: net: remove delay at device dismantle I noticed extra one second delay in device dismantle, tracked down to a call to dst_dev_event() while some call_rcu() are still in RCU queues. These call_rcu() were posted by rt_free(struct rtable *rt) calls. We then wait a little (but one second) in netdev_wait_allrefs() before kicking again NETDEV_UNREGISTER. As the call_rcu() are now completed, dst_dev_event() can do the needed device swap on busy dst. To solve this problem, add a new NETDEV_UNREGISTER_FINAL, called after a rcu_barrier(), but outside of RTNL lock. Use NETDEV_UNREGISTER_FINAL with care ! Change dst_dev_event() handler to react to NETDEV_UNREGISTER_FINAL Also remove NETDEV_UNREGISTER_BATCH, as its not used anymore after IP cache removal. With help from Gao feng Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Mahesh Bandewar Cc: "Eric W. Biederman" Cc: Gao feng Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++++++++-------------- net/core/dst.c | 2 +- net/core/fib_rules.c | 3 ++- net/core/rtnetlink.c | 2 +- 4 files changed, 12 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 088923fe4066..0640d2a859c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1406,7 +1406,6 @@ rollback: nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } @@ -1448,7 +1447,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb) nb->notifier_call(nb, NETDEV_DOWN, dev); } nb->notifier_call(nb, NETDEV_UNREGISTER, dev); - nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); } } unlock: @@ -1468,7 +1466,8 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); + if (val != NETDEV_UNREGISTER_FINAL) + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -5331,10 +5330,6 @@ static void rollback_registered_many(struct list_head *head) netdev_unregister_kobject(dev); } - /* Process any work delayed until the end of the batch */ - dev = list_first_entry(head, struct net_device, unreg_list); - call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - synchronize_net(); list_for_each_entry(dev, head, unreg_list) @@ -5787,9 +5782,8 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users - * should have already handle it the first time */ - + rcu_barrier(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events @@ -5851,9 +5845,8 @@ void netdev_run_todo(void) __rtnl_unlock(); - /* Wait for rcu callbacks to finish before attempting to drain - * the device list. This usually avoids a 250ms wait. - */ + + /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); @@ -5862,6 +5855,8 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", dev->name, dev->reg_state); @@ -6256,7 +6251,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* diff --git a/net/core/dst.c b/net/core/dst.c index 56d63612e1e4..f6593d238e9a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -374,7 +374,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, struct dst_entry *dst, *last = NULL; switch (event) { - case NETDEV_UNREGISTER: + case NETDEV_UNREGISTER_FINAL: case NETDEV_DOWN: mutex_lock(&dst_gc_mutex); for (dst = dst_busy_list; dst; dst = dst->next) { diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ab7db83236c9..585093755c23 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -711,15 +711,16 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct net *net = dev_net(dev); struct fib_rules_ops *ops; - ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: + ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_UNREGISTER: + ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 34d975b0f277..c64efcff8078 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2358,7 +2358,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_PRE_TYPE_CHANGE: case NETDEV_GOING_DOWN: case NETDEV_UNREGISTER: - case NETDEV_UNREGISTER_BATCH: + case NETDEV_UNREGISTER_FINAL: case NETDEV_RELEASE: case NETDEV_JOIN: break; -- cgit v1.2.3 From 748e2d9396a18c3fd3d07d47c1b41320acf1fbf4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Aug 2012 21:50:59 +0000 Subject: net: reinstate rtnl in call_netdevice_notifiers() Eric Biederman pointed out that not holding RTNL while calling call_netdevice_notifiers() was racy. This patch is a direct transcription his feedback against commit 0115e8e30d6fc (net: remove delay at device dismantle) Thanks Eric ! Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Mahesh Bandewar Cc: "Eric W. Biederman" Cc: Gao feng Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/dev.c | 9 +++++++-- net/core/fib_rules.c | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0640d2a859c6..bc857fead8c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1466,8 +1466,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - if (val != NETDEV_UNREGISTER_FINAL) - ASSERT_RTNL(); + ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, dev); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -5782,7 +5781,11 @@ static void netdev_wait_allrefs(struct net_device *dev) /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + __rtnl_unlock(); rcu_barrier(); + rtnl_lock(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { @@ -5855,7 +5858,9 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); + rtnl_lock(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); + __rtnl_unlock(); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 585093755c23..ab7db83236c9 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -711,16 +711,15 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event, struct net *net = dev_net(dev); struct fib_rules_ops *ops; + ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: - ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_UNREGISTER: - ASSERT_RTNL(); list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; -- cgit v1.2.3 From 3afa6d00fb4f9712fbb44b63ba31f88b6f9239fe Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 20 Aug 2012 07:59:10 +0000 Subject: cls_cgroup: Allow classifier cgroups to have their classid reset to 0 The network classifier cgroup initalizes each cgroups instance classid value to 0. However, the sock_update_classid function only updates classid's in sockets if the tasks cgroup classid is not zero, and if it differs from the current classid. The later check is to prevent cache line dirtying, but the former is detrimental, as it prevents resetting a classid for a cgroup to 0. While this is not a common action, it has administrative usefulness (if the admin wants to disable classification of a certain group temporarily for instance). Easy fix, just remove the zero check. Tested successfully by myself Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 8f67ced8d6a8..116786c55fe9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1230,7 +1230,7 @@ void sock_update_classid(struct sock *sk) rcu_read_lock(); /* doing current task, which cannot vanish. */ classid = task_cls_classid(current); rcu_read_unlock(); - if (classid && classid != sk->sk_classid) + if (classid != sk->sk_classid) sk->sk_classid = classid; } EXPORT_SYMBOL(sock_update_classid); -- cgit v1.2.3 From 8f4cccbbd92f2ad0ddbbc498ef7cee2a1c3defe9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 20 Aug 2012 22:16:51 +0100 Subject: net: Set device operstate at registration time The operstate of a device is initially IF_OPER_UNKNOWN and is updated asynchronously by linkwatch after each change of carrier state reported by the driver. The default carrier state of a net device is on, and this will never be changed on drivers that do not support carrier detection, thus the operstate remains IF_OPER_UNKNOWN. For devices that do support carrier detection, the driver must set the carrier state to off initially, then poll the hardware state when the device is opened. However, we must not activate linkwatch for a unregistered device, and commit b473001 ('net: Do not fire linkwatch events until the device is registered.') ensured that we don't. But this means that the operstate for many devices that support carrier detection remains IF_OPER_UNKNOWN when it should be IF_OPER_DOWN. The same issue exists with the dormant state. The proper initialisation sequence, avoiding a race with opening of the device, is: rtnl_lock(); rc = register_netdevice(dev); if (rc) goto out_unlock; netif_carrier_off(dev); /* or netif_dormant_on(dev) */ rtnl_unlock(); but it seems silly that this should have to be repeated in so many drivers. Further, the operstate seen immediately after opening the device may still be IF_OPER_UNKNOWN due to the asynchronous nature of linkwatch. Commit 22604c8 ('net: Fix for initial link state in 2.6.28') attempted to fix this by setting the operstate synchronously, but it was reverted as it could lead to deadlock. This initialises the operstate synchronously at registration time only. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ net/core/link_watch.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bc857fead8c8..2f25d0cac51c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5648,6 +5648,8 @@ int register_netdevice(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); + linkwatch_init_dev(dev); + dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index c3519c6d1b16..a01922219a23 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -76,6 +76,14 @@ static void rfc2863_policy(struct net_device *dev) } +void linkwatch_init_dev(struct net_device *dev) +{ + /* Handle pre-registration link state changes */ + if (!netif_carrier_ok(dev) || netif_dormant(dev)) + rfc2863_policy(dev); +} + + static bool linkwatch_urgent_event(struct net_device *dev) { if (!netif_running(dev)) -- cgit v1.2.3 From 2cf545e835aae92173ef0b1f4af385e9c40f21e8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 26 Aug 2012 19:14:10 +0200 Subject: net: core: add function for incremental IPv6 pseudo header checksum updates Add inet_proto_csum_replace16 for incrementally updating IPv6 pseudo header checksums for IPv6 NAT. Signed-off-by: Patrick McHardy Acked-by: David S. Miller --- net/core/utils.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/core') diff --git a/net/core/utils.c b/net/core/utils.c index 39895a65e54a..f5613d569c23 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -294,6 +294,26 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace4); +void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, + const __be32 *from, const __be32 *to, + int pseudohdr) +{ + __be32 diff[] = { + ~from[0], ~from[1], ~from[2], ~from[3], + to[0], to[1], to[2], to[3], + }; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_partial(diff, sizeof(diff), + ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_partial(diff, sizeof(diff), + ~skb->csum); + } else if (pseudohdr) + *sum = ~csum_fold(csum_partial(diff, sizeof(diff), + csum_unfold(*sum))); +} +EXPORT_SYMBOL(inet_proto_csum_replace16); + int mac_pton(const char *s, u8 *mac) { int i; -- cgit v1.2.3 From 58a317f1061c894d2344c0b6a18ab4a64b69b815 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 26 Aug 2012 19:14:12 +0200 Subject: netfilter: ipv6: add IPv6 NAT support Signed-off-by: Patrick McHardy --- net/core/secure_seq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 99b2596531bb..e61a8bb7fce7 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -76,6 +76,7 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, return hash[0]; } +EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -- cgit v1.2.3 From 6549dd43c04327071571edf7aea4465b16539422 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 23 Aug 2012 15:36:55 +0000 Subject: net: dev: fix the incorrect hold of net namespace's lo device When moving a net device from one net namespace to another net namespace,dev_change_net_namespace calls NETDEV_DOWN event,so the original net namespace's dst entries which beloned to this net device will be put into dst_garbage list. then dev_change_net_namespace will set this net device's net to the new net namespace. If we unregister this net device's driver, this will trigger the NETDEV_UNREGISTER_FINAL event, dst_ifdown will be called, and get this net device's dst entries from dst_garbage list, put these entries' dev to the new net namespace's lo device. It's not what we want,actually we need these dst entries hold the original net namespace's lo device,this incorrect device holding will trigger emg message like below. unregister_netdevice: waiting for lo to become free. Usage count = 1 so we should call NETDEV_UNREGISTER_FINAL event in dev_change_net_namespace too,in order to make sure dst entries already in the dst_garbage list, we need rcu_barrier before we call NETDEV_UNREGISTER_FINAL event. With help form Eric Dumazet. Signed-off-by: Gao feng Cc: Eric Dumazet Cc: "Eric W. Biederman" Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3401e2dab7cc..a5fc3e301cf2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6259,6 +6259,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + rcu_barrier(); + call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* -- cgit v1.2.3 From ee13040901c11b016d996ab5a946acaaa3461737 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Aug 2012 01:47:26 +0000 Subject: netpoll: provide an IP ident in UDP frames Let's fill IP header ident field with a meaningful value, it might help some setups. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/netpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 346b1eb83a1f..5af9c2692506 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -388,6 +388,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) struct udphdr *udph; struct iphdr *iph; struct ethhdr *eth; + static atomic_t ip_ident; udp_len = len + sizeof(*udph); ip_len = udp_len + sizeof(*iph); @@ -423,7 +424,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) put_unaligned(0x45, (unsigned char *)iph); iph->tos = 0; put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = 0; + iph->id = htons(atomic_inc_return(&ip_ident)); iph->frag_off = 0; iph->ttl = 64; iph->protocol = IPPROTO_UDP; -- cgit v1.2.3 From d1a53dfd114a6c53464de116cdab88d934bfe92f Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Mon, 27 Aug 2012 23:39:24 +0000 Subject: net: fix documentation of skb_needs_linearize(). skb_needs_linearize() does not check highmem DMA as it does not call illegal_highdma() anymore, so there is no need to mention highmem DMA here. (Indeed, ~NETIF_F_SG flag, which is checked in skb_needs_linearize(), can be set when illegal_highdma() returns true, and we are assured that illegal_highdma() is invoked prior to skb_needs_linearize() as skb_needs_linearize() is a static method called only once. But ~NETIF_F_SG can be set not only there in this same invocation path. It can also be set when can_checksum_protocol() returns false). see commit 02932ce9e2c136e6fab2571c8e0dd69ae8ec9853, Convert skb_need_linearize() to use precomputed features. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/dev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a5fc3e301cf2..b1e6d6385516 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2184,9 +2184,7 @@ EXPORT_SYMBOL(netif_skb_features); /* * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or - * 2. skb is fragmented and the device does not support SG, or if - * at least one of fragments is in highmem and device does not - * support DMA from it. + * 2. skb is fragmented and the device does not support SG. */ static inline int skb_needs_linearize(struct sk_buff *skb, int features) -- cgit v1.2.3 From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 31 Aug 2012 12:29:12 +0000 Subject: tcp: TCP Fast Open Server - support TFO listeners This patch builds on top of the previous patch to add the support for TFO listeners. This includes - 1. allocating, properly initializing, and managing the per listener fastopen_queue structure when TFO is enabled 2. changes to the inet_csk_accept code to support TFO. E.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes 5. supporting TCP_FASTOPEN socket option 6. modifying tcp_check_req() to use to check a TFO socket as well as request_sock 7. supporting TCP's TFO cookie option 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for the detail. Signed-off-by: H.K. Jerry Chu Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/core/request_sock.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'net/core') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 9b570a6a33c5..c31d9e8668c3 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) kfree(lopt); } +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. A corner + * case might also exist in tcp_v4_hnd_req() that will trigger this locking + * order. + * + * When a TFO req is created, it needs to sock_hold its listener to prevent + * the latter data structure from going away. + * + * This function also sets "treq->listener" to NULL and unreference listener + * socket. treq->listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = tcp_rsk(req)->listener; + struct fastopen_queue *fastopenq = + inet_csk(lsk)->icsk_accept_queue.fastopenq; + + BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk)); + + tcp_sk(sk)->fastopen_rsk = NULL; + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->listener = NULL; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + reqsk_free(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. + */ + req->expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); + sock_put(lsk); + return; +} -- cgit v1.2.3 From c6c13965f4ffca6163af0c9419095aedc103648c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 5 Sep 2012 04:11:28 +0000 Subject: net: add unknown state to sysfs NIC duplex export Currently when the NIC duplex state is DUPLEX_UNKNOWN it is exported as full through sysfs, this patch adds support for DUPLEX_UNKNOWN. It is handled the same way as in ethtool. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 72607174ea5a..bcf02f608cbf 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -166,9 +166,21 @@ static ssize_t show_duplex(struct device *dev, if (netif_running(netdev)) { struct ethtool_cmd cmd; - if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, "%s\n", - cmd.duplex ? "full" : "half"); + if (!__ethtool_get_settings(netdev, &cmd)) { + const char *duplex; + switch (cmd.duplex) { + case DUPLEX_HALF: + duplex = "half"; + break; + case DUPLEX_FULL: + duplex = "full"; + break; + default: + duplex = "unknown"; + break; + } + ret = sprintf(buf, "%s\n", duplex); + } } rtnl_unlock(); return ret; -- cgit v1.2.3 From dbe9a4173ea53b72b2c35d19f676a85b69f1c9fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 6 Sep 2012 18:20:01 +0000 Subject: scm: Don't use struct ucred in NETLINK_CB and struct scm_cookie. Passing uids and gids on NETLINK_CB from a process in one user namespace to a process in another user namespace can result in the wrong uid or gid being presented to userspace. Avoid that problem by passing kuids and kgids instead. - define struct scm_creds for use in scm_cookie and netlink_skb_parms that holds uid and gid information in kuid_t and kgid_t. - Modify scm_set_cred to fill out scm_creds by heand instead of using cred_to_ucred to fill out struct ucred. This conversion ensures userspace does not get incorrect uid or gid values to look at. - Modify scm_recv to convert from struct scm_creds to struct ucred before copying credential values to userspace. - Modify __scm_send to populate struct scm_creds on in the scm_cookie, instead of just copying struct ucred from userspace. - Modify netlink_sendmsg to copy scm_creds instead of struct ucred into the NETLINK_CB. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/scm.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index 6ab491d6c26f..9c1c63da3ca8 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -155,19 +155,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) break; case SCM_CREDENTIALS: { + struct ucred creds; kuid_t uid; kgid_t gid; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) goto error; - memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); - err = scm_check_creds(&p->creds); + memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&creds); if (err) goto error; - if (!p->pid || pid_vnr(p->pid) != p->creds.pid) { + p->creds.pid = creds.pid; + if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; - pid = find_get_pid(p->creds.pid); + pid = find_get_pid(creds.pid); if (!pid) goto error; put_pid(p->pid); @@ -175,11 +177,14 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) } err = -EINVAL; - uid = make_kuid(current_user_ns(), p->creds.uid); - gid = make_kgid(current_user_ns(), p->creds.gid); + uid = make_kuid(current_user_ns(), creds.uid); + gid = make_kgid(current_user_ns(), creds.gid); if (!uid_valid(uid) || !gid_valid(gid)) goto error; + p->creds.uid = uid; + p->creds.gid = gid; + if (!p->cred || !uid_eq(p->cred->euid, uid) || !gid_eq(p->cred->egid, gid)) { -- cgit v1.2.3 From 9785e10aedfa0fad5c1aac709dce5ada1b123783 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 8 Sep 2012 02:53:53 +0000 Subject: netlink: kill netlink_set_nonroot Replace netlink_set_nonroot by one new field `flags' in struct netlink_kernel_cfg that is passed to netlink_kernel_create. This patch also renames NL_NONROOT_* to NL_CFG_F_NONROOT_* since now the flags field in nl_table is generic (so we can add more flags if needed in the future). Also adjust all callers in the net-next tree to use these flags instead of netlink_set_nonroot. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c64efcff8078..a71806eb9cc6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2381,6 +2381,7 @@ static int __net_init rtnetlink_net_init(struct net *net) .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, .cb_mutex = &rtnl_mutex, + .flags = NL_CFG_F_NONROOT_RECV, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); @@ -2416,7 +2417,6 @@ void __init rtnetlink_init(void) if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); - netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, -- cgit v1.2.3 From 9f00d9776bc5beb92e8bfc884a7e96ddc5589e2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 8 Sep 2012 02:53:54 +0000 Subject: netlink: hide struct module parameter in netlink_kernel_create This patch defines netlink_kernel_create as a wrapper function of __netlink_kernel_create to hide the struct module *me parameter (which seems to be THIS_MODULE in all existing netlink subsystems). Suggested by David S. Miller. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- net/core/sock_diag.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a71806eb9cc6..508c5df4a09c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2384,7 +2384,7 @@ static int __net_init rtnetlink_net_init(struct net *net) .flags = NL_CFG_F_NONROOT_RECV, }; - sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); + sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 9d8755e4a7a5..602cd637182e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -172,8 +172,7 @@ static int __net_init diag_net_init(struct net *net) .input = sock_diag_rcv, }; - net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, - THIS_MODULE, &cfg); + net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); return net->diag_nlsk == NULL ? -ENOMEM : 0; } -- cgit v1.2.3 From 15e473046cb6e5d18a4d0057e61d76315230382b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Sep 2012 20:12:54 +0000 Subject: netlink: Rename pid to portid to avoid confusion It is a frequent mistake to confuse the netlink port identifier with a process identifier. Try to reduce this confusion by renaming fields that hold port identifiers portid instead of pid. I have carefully avoided changing the structures exported to userspace to avoid changing the userspace API. I have successfully built an allyesconfig kernel with this change. Signed-off-by: "Eric W. Biederman" Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/fib_rules.c | 6 +++--- net/core/neighbour.c | 8 ++++---- net/core/rtnetlink.c | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ab7db83236c9..58a4ba27dfe3 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -402,7 +402,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (unresolved) ops->unresolved_rules++; - notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); return 0; @@ -500,7 +500,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).pid); + NETLINK_CB(skb).portid); if (ops->delete) ops->delete(rule); fib_rule_put(rule); @@ -601,7 +601,7 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid, + if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops) < 0) break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 117afaf51268..c160adb38e5a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2102,7 +2102,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (tidx < tbl_skip || (family && tbl->family != family)) continue; - if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid, + if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) <= 0) break; @@ -2115,7 +2115,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) goto next; if (neightbl_fill_param_info(skb, tbl, p, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) <= 0) @@ -2244,7 +2244,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, continue; if (idx < s_idx) goto next; - if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI) <= 0) { @@ -2281,7 +2281,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, continue; if (idx < s_idx) goto next; - if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, tbl) <= 0) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 508c5df4a09c..92575370d9f0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1081,7 +1081,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (idx < s_idx) goto cont; if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, - NETLINK_CB(cb->skb).pid, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, NLM_F_MULTI, ext_filter_mask) <= 0) @@ -1899,14 +1899,14 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (nskb == NULL) return -ENOBUFS; - err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, + err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else - err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); return err; } @@ -2180,9 +2180,9 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, { struct netdev_hw_addr *ha; int err; - u32 pid, seq; + u32 portid, seq; - pid = NETLINK_CB(cb->skb).pid; + portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { @@ -2190,7 +2190,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, - pid, seq, 0, NTF_SELF); + portid, seq, 0, NTF_SELF); if (err < 0) return err; skip: -- cgit v1.2.3 From b6069a95706ca5738be3f5d90fd286cbd13ac695 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Sep 2012 22:03:35 +0000 Subject: filter: add MOD operation Add a new ALU opcode, to compute a modulus. Commit ffe06c17afbbb used an ancillary to implement XOR_X, but here we reserve one of the available ALU opcode to implement both MOD_X and MOD_K Signed-off-by: Eric Dumazet Suggested-by: George Bakos Cc: Jay Schulist Cc: Jiri Pirko Cc: Andi Kleen Signed-off-by: David S. Miller --- net/core/filter.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 907efd27ec77..fbe3a8d12570 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -167,6 +167,14 @@ unsigned int sk_run_filter(const struct sk_buff *skb, case BPF_S_ALU_DIV_K: A = reciprocal_divide(A, K); continue; + case BPF_S_ALU_MOD_X: + if (X == 0) + return 0; + A %= X; + continue; + case BPF_S_ALU_MOD_K: + A %= K; + continue; case BPF_S_ALU_AND_X: A &= X; continue; @@ -469,6 +477,8 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, + [BPF_ALU|BPF_MOD|BPF_K] = BPF_S_ALU_MOD_K, + [BPF_ALU|BPF_MOD|BPF_X] = BPF_S_ALU_MOD_X, [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, @@ -531,6 +541,11 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) return -EINVAL; ftest->k = reciprocal_value(ftest->k); break; + case BPF_S_ALU_MOD_K: + /* check for division by zero */ + if (ftest->k == 0) + return -EINVAL; + break; case BPF_S_LD_MEM: case BPF_S_LDX_MEM: case BPF_S_ST: -- cgit v1.2.3 From d530d6df96ee28902486f8e11815ef9ad3a1cd1b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 12 Sep 2012 20:32:25 +0000 Subject: netprio_cgroup: Remove update_netdev_tables() since it is unnecessary The update_netdev_tables() function appears to be unnecessary, since the write_update_netdev_table() function will adjust the priomaps as and when required anyway. So drop the usage of update_netdev_tables() entirely. Signed-off-by: Srivatsa S. Bhat Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index c75e3f9d060f..fd339bb00106 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -109,32 +109,6 @@ static int write_update_netdev_table(struct net_device *dev) return ret; } -static int update_netdev_tables(void) -{ - int ret = 0; - struct net_device *dev; - u32 max_len; - struct netprio_map *map; - - rtnl_lock(); - max_len = atomic_read(&max_prioidx) + 1; - for_each_netdev(&init_net, dev) { - map = rtnl_dereference(dev->priomap); - /* - * don't allocate priomap if we didn't - * change net_prio.ifpriomap (map == NULL), - * this will speed up skb_update_prio. - */ - if (map && map->priomap_len < max_len) { - ret = extend_netdev_table(dev, max_len); - if (ret < 0) - break; - } - } - rtnl_unlock(); - return ret; -} - static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; @@ -153,12 +127,6 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) goto out; } - ret = update_netdev_tables(); - if (ret < 0) { - put_prioidx(cs->prioidx); - goto out; - } - return &cs->css; out: kfree(cs); -- cgit v1.2.3 From f05ba7fccf0c5f0422378adaffcb119d08b9f304 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 12 Sep 2012 20:32:34 +0000 Subject: netprio_cgroup: Use memcpy instead of the for-loop to copy priomap Replace the current (inefficient) for-loop with memcpy, to copy priomap. Signed-off-by: Srivatsa S. Bhat Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index fd339bb00106..45c503e45fc6 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -73,7 +73,6 @@ static int extend_netdev_table(struct net_device *dev, u32 new_len) ((sizeof(u32) * new_len)); struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); struct netprio_map *old_priomap; - int i; old_priomap = rtnl_dereference(dev->priomap); @@ -82,10 +81,10 @@ static int extend_netdev_table(struct net_device *dev, u32 new_len) return -ENOMEM; } - for (i = 0; - old_priomap && (i < old_priomap->priomap_len); - i++) - new_priomap->priomap[i] = old_priomap->priomap[i]; + if (old_priomap) + memcpy(new_priomap->priomap, old_priomap->priomap, + old_priomap->priomap_len * + sizeof(old_priomap->priomap[0])); new_priomap->priomap_len = new_len; -- cgit v1.2.3 From b40863c667c16b7a73d4f034a8eab67029b5b15a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 18 Sep 2012 20:44:49 +0000 Subject: net: more accurate network taps in transmit path dev_queue_xmit_nit() should be called right before ndo_start_xmit() calls or we might give wrong packet contents to taps users : Packet checksum can be changed, or packet can be linearized or segmented, and segments partially sent for the later case. Also a memory allocation can fail and packet never really hit the driver entry point. Reported-by: Jamie Gloudon Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index dcc673d0674c..52cd1d7f004a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2213,9 +2213,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); - if (!list_empty(&ptype_all)) - dev_queue_xmit_nit(skb, dev); - features = netif_skb_features(skb); if (vlan_tx_tag_present(skb) && @@ -2250,6 +2247,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } } + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + skb_len = skb->len; rc = ops->ndo_start_xmit(skb, dev); trace_net_dev_xmit(skb, rc, dev, skb_len); @@ -2272,6 +2272,9 @@ gso: if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(nskb); + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(nskb, dev); + skb_len = nskb->len; rc = ops->ndo_start_xmit(nskb, dev); trace_net_dev_xmit(nskb, rc, dev, skb_len); -- cgit v1.2.3 From 828de4f6bf6e785f7b5497f8f7cfd4d4fbcfdb7e Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 13 Sep 2012 20:58:27 +0000 Subject: net: dev: fix incorrect getting net device's name When moving a nic from net namespace A to net namespace B, in dev_change_net_namesapce,we call __dev_get_by_name to decide if the netns B has the device has the same name. if the netns B already has the same named device,we call dev_get_valid_name to try to get a valid name for this nic in the netns B,but net_device->nd_net still point to netns A now. this patch fix it. Signed-off-by: Gao feng Signed-off-by: David S. Miller --- net/core/dev.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 52cd1d7f004a..bb42eb161969 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -959,18 +959,30 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -static int dev_get_valid_name(struct net_device *dev, const char *name) +static int dev_alloc_name_ns(struct net *net, + struct net_device *dev, + const char *name) { - struct net *net; + char buf[IFNAMSIZ]; + int ret; - BUG_ON(!dev_net(dev)); - net = dev_net(dev); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} + +static int dev_get_valid_name(struct net *net, + struct net_device *dev, + const char *name) +{ + BUG_ON(!net); if (!dev_valid_name(name)) return -EINVAL; if (strchr(name, '%')) - return dev_alloc_name(dev, name); + return dev_alloc_name_ns(net, dev, name); else if (__dev_get_by_name(net, name)) return -EEXIST; else if (dev->name != name) @@ -1006,7 +1018,7 @@ int dev_change_name(struct net_device *dev, const char *newname) memcpy(oldname, dev->name, IFNAMSIZ); - err = dev_get_valid_name(dev, newname); + err = dev_get_valid_name(net, dev, newname); if (err < 0) return err; @@ -5589,7 +5601,7 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; - ret = dev_get_valid_name(dev, dev->name); + ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; @@ -6233,7 +6245,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* We get here if we can't use the current device name */ if (!pat) goto out; - if (dev_get_valid_name(dev, pat) < 0) + if (dev_get_valid_name(net, dev, pat) < 0) goto out; } -- cgit v1.2.3 From 2c60db037034d27f8c636403355d52872da92f81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Sep 2012 09:17:26 +0000 Subject: net: provide a default dev->ethtool_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of forcing device drivers to provide empty ethtool_ops or tweak net/core/ethtool.c again, we could provide a generic ethtool_ops. This occurred to me when I wanted to add GSO support to GRE tunnels. ethtool -k support should be generic for all drivers. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Maciej Żenczykowski Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++++ net/core/ethtool.c | 12 ------------ 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bb42eb161969..bbda81997f4f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5974,6 +5974,8 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) return queue; } +static const struct ethtool_ops default_ethtool_ops; + /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -6061,6 +6063,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, strcpy(dev->name, name); dev->group = INIT_NETDEV_GROUP; + if (!dev->ethtool_ops) + dev->ethtool_ops = &default_ethtool_ops; return dev; free_all: diff --git a/net/core/ethtool.c b/net/core/ethtool.c index cbf033dcaf1f..4d64cc2e3fa9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1426,18 +1426,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; - if (!dev->ethtool_ops) { - /* A few commands do not require any driver support, - * are unprivileged, and do not change anything, so we - * can take a shortcut to them. */ - if (ethcmd == ETHTOOL_GDRVINFO) - return ethtool_get_drvinfo(dev, useraddr); - else if (ethcmd == ETHTOOL_GET_TS_INFO) - return ethtool_get_ts_info(dev, useraddr); - else - return -EOPNOTSUPP; - } - /* Allow some commands to be done by anyone */ switch (ethcmd) { case ETHTOOL_GSET: -- cgit v1.2.3 From 6b6e27255f29a6191ef8ad96bfcc392ab2ef6c71 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 17 Sep 2012 10:03:26 +0000 Subject: netdev: make address const in device address management The internal functions for add/deleting addresses don't change their argument. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index c4cc2bc49f06..87cc17db2d56 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -22,7 +22,7 @@ */ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, - unsigned char *addr, int addr_len, + const unsigned char *addr, int addr_len, unsigned char addr_type, bool global) { struct netdev_hw_addr *ha; @@ -46,7 +46,7 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, - unsigned char *addr, int addr_len, + const unsigned char *addr, int addr_len, unsigned char addr_type, bool global) { struct netdev_hw_addr *ha; @@ -72,14 +72,15 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, return __hw_addr_create_ex(list, addr, addr_len, addr_type, global); } -static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) +static int __hw_addr_add(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); } static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, - unsigned char *addr, int addr_len, + const unsigned char *addr, int addr_len, unsigned char addr_type, bool global) { struct netdev_hw_addr *ha; @@ -104,8 +105,9 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, return -ENOENT; } -static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, - int addr_len, unsigned char addr_type) +static int __hw_addr_del(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) { return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); } @@ -278,7 +280,7 @@ EXPORT_SYMBOL(dev_addr_init); * * The caller must hold the rtnl_mutex. */ -int dev_addr_add(struct net_device *dev, unsigned char *addr, +int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; @@ -303,7 +305,7 @@ EXPORT_SYMBOL(dev_addr_add); * * The caller must hold the rtnl_mutex. */ -int dev_addr_del(struct net_device *dev, unsigned char *addr, +int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; @@ -390,7 +392,7 @@ EXPORT_SYMBOL(dev_addr_del_multiple); * @dev: device * @addr: address to add */ -int dev_uc_add_excl(struct net_device *dev, unsigned char *addr) +int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { struct netdev_hw_addr *ha; int err; @@ -421,7 +423,7 @@ EXPORT_SYMBOL(dev_uc_add_excl); * Add a secondary unicast address to the device or increase * the reference count if it already exists. */ -int dev_uc_add(struct net_device *dev, unsigned char *addr) +int dev_uc_add(struct net_device *dev, const unsigned char *addr) { int err; @@ -443,7 +445,7 @@ EXPORT_SYMBOL(dev_uc_add); * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. */ -int dev_uc_del(struct net_device *dev, unsigned char *addr) +int dev_uc_del(struct net_device *dev, const unsigned char *addr) { int err; @@ -543,7 +545,7 @@ EXPORT_SYMBOL(dev_uc_init); * @dev: device * @addr: address to add */ -int dev_mc_add_excl(struct net_device *dev, unsigned char *addr) +int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { struct netdev_hw_addr *ha; int err; @@ -566,7 +568,7 @@ out: } EXPORT_SYMBOL(dev_mc_add_excl); -static int __dev_mc_add(struct net_device *dev, unsigned char *addr, +static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, bool global) { int err; @@ -587,7 +589,7 @@ static int __dev_mc_add(struct net_device *dev, unsigned char *addr, * Add a multicast address to the device or increase * the reference count if it already exists. */ -int dev_mc_add(struct net_device *dev, unsigned char *addr) +int dev_mc_add(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, false); } @@ -600,13 +602,13 @@ EXPORT_SYMBOL(dev_mc_add); * * Add a global multicast address to the device. */ -int dev_mc_add_global(struct net_device *dev, unsigned char *addr) +int dev_mc_add_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, true); } EXPORT_SYMBOL(dev_mc_add_global); -static int __dev_mc_del(struct net_device *dev, unsigned char *addr, +static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, bool global) { int err; @@ -628,7 +630,7 @@ static int __dev_mc_del(struct net_device *dev, unsigned char *addr, * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ -int dev_mc_del(struct net_device *dev, unsigned char *addr) +int dev_mc_del(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, false); } @@ -642,7 +644,7 @@ EXPORT_SYMBOL(dev_mc_del); * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ -int dev_mc_del_global(struct net_device *dev, unsigned char *addr) +int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, true); } -- cgit v1.2.3 From 8c4c49df5cfeb8d56e5b85a430c8cbcb86c2ac37 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 17 Sep 2012 20:16:31 +0000 Subject: netpoll: call ->ndo_select_queue() in tx path In netpoll tx path, we miss the chance of calling ->ndo_select_queue(), thus could cause problems when bonding is involved. This patch makes dev_pick_tx() extern (and rename it to netdev_pick_tx()) to let netpoll call it in netpoll_send_skb_on_dev(). Reported-by: Sylvain Munaut Cc: "David S. Miller" Cc: Eric Dumazet Signed-off-by: Cong Wang Tested-by: Sylvain Munaut Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/netpoll.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bbda81997f4f..707b12425a79 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2396,8 +2396,8 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -static struct netdev_queue *dev_pick_tx(struct net_device *dev, - struct sk_buff *skb) +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb) { int queue_index; const struct net_device_ops *ops = dev->netdev_ops; @@ -2571,7 +2571,7 @@ int dev_queue_xmit(struct sk_buff *skb) skb_update_prio(skb); - txq = dev_pick_tx(dev, skb); + txq = netdev_pick_tx(dev, skb); q = rcu_dereference_bh(txq->qdisc); #ifdef CONFIG_NET_CLS_ACT diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dd67818025d1..77a0388fc3be 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -328,7 +328,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = netdev_pick_tx(dev, skb); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; -- cgit v1.2.3 From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Sep 2012 23:04:42 +0000 Subject: net: use a per task frag allocator We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. Its done to increase probability of coalescing small write() into single segments in skbs still in write queue (not yet sent) But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page Its also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit page allocator more than wanted. This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (up to 32768 bytes per frag, thats order-3 pages on x86) This increases TCP stream performance by 20% on loopback device, but also benefits on other network devices, since 8x less frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. Its possible some SG enabled hardware cant cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment in sub fragments, since some arches have PAGE_SIZE=65536 Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4) Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Vijay Subramanian Cc: Alexander Duyck Tested-by: Vijay Subramanian Signed-off-by: David S. Miller --- net/core/skbuff.c | 37 +++++++++---------------------------- net/core/sock.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 30 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe00d1208167..2ede3cfa8ffa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1655,38 +1655,19 @@ static struct page *linear_to_page(struct page *page, unsigned int *len, unsigned int *offset, struct sk_buff *skb, struct sock *sk) { - struct page *p = sk->sk_sndmsg_page; - unsigned int off; + struct page_frag *pfrag = sk_page_frag(sk); - if (!p) { -new_page: - p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); - if (!p) - return NULL; - - off = sk->sk_sndmsg_off = 0; - /* hold one ref to this page until it's full */ - } else { - unsigned int mlen; - - /* If we are the only user of the page, we can reset offset */ - if (page_count(p) == 1) - sk->sk_sndmsg_off = 0; - off = sk->sk_sndmsg_off; - mlen = PAGE_SIZE - off; - if (mlen < 64 && mlen < *len) { - put_page(p); - goto new_page; - } + if (!sk_page_frag_refill(sk, pfrag)) + return NULL; - *len = min_t(unsigned int, *len, mlen); - } + *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); - memcpy(page_address(p) + off, page_address(page) + *offset, *len); - sk->sk_sndmsg_off += *len; - *offset = off; + memcpy(page_address(pfrag->page) + pfrag->offset, + page_address(page) + *offset, *len); + *offset = pfrag->offset; + pfrag->offset += *len; - return p; + return pfrag->page; } static bool spd_can_coalesce(const struct splice_pipe_desc *spd, diff --git a/net/core/sock.c b/net/core/sock.c index 2693f7649222..727114cd6f7e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1744,6 +1744,45 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +/* On 32bit arches, an skb frag is limited to 2^15 */ +#define SKB_FRAG_PAGE_ORDER get_order(32768) + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + int order; + + if (pfrag->page) { + if (atomic_read(&pfrag->page->_count) == 1) { + pfrag->offset = 0; + return true; + } + if (pfrag->offset < pfrag->size) + return true; + put_page(pfrag->page); + } + + /* We restrict high order allocations to users that can afford to wait */ + order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; + + do { + gfp_t gfp = sk->sk_allocation; + + if (order) + gfp |= __GFP_COMP | __GFP_NOWARN; + pfrag->page = alloc_pages(gfp, order); + if (likely(pfrag->page)) { + pfrag->offset = 0; + pfrag->size = PAGE_SIZE << order; + return true; + } + } while (--order >= 0); + + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; +} +EXPORT_SYMBOL(sk_page_frag_refill); + static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) @@ -2173,8 +2212,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; @@ -2417,6 +2456,12 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); + + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + sock_put(sk); } EXPORT_SYMBOL(sk_common_release); -- cgit v1.2.3 From 9e49e88958feb41ec701fa34b44723dabadbc28c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 24 Sep 2012 02:23:59 +0000 Subject: filter: add XOR instruction for use with X/K SKF_AD_ALU_XOR_X has been added a while ago, but as an 'ancillary' operation that is invoked through a negative offset in K within BPF load operations. Since BPF_MOD has recently been added, BPF_XOR should also be part of the common ALU operations. Removing SKF_AD_ALU_XOR_X might not be an option since this is exposed to user space. Signed-off-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index fbe3a8d12570..3d92ebb7fbcf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -187,6 +187,13 @@ unsigned int sk_run_filter(const struct sk_buff *skb, case BPF_S_ALU_OR_K: A |= K; continue; + case BPF_S_ANC_ALU_XOR_X: + case BPF_S_ALU_XOR_X: + A ^= X; + continue; + case BPF_S_ALU_XOR_K: + A ^= K; + continue; case BPF_S_ALU_LSH_X: A <<= X; continue; @@ -334,9 +341,6 @@ load_b: case BPF_S_ANC_CPU: A = raw_smp_processor_id(); continue; - case BPF_S_ANC_ALU_XOR_X: - A ^= X; - continue; case BPF_S_ANC_NLATTR: { struct nlattr *nla; @@ -483,6 +487,8 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, + [BPF_ALU|BPF_XOR|BPF_K] = BPF_S_ALU_XOR_K, + [BPF_ALU|BPF_XOR|BPF_X] = BPF_S_ALU_XOR_X, [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, -- cgit v1.2.3 From e2bcabec6ea5ba30dd2097dc1566e9957d14117c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Sep 2012 11:32:13 +0000 Subject: net: remove sk_init() helper It seems sk_init() has no value today and even does strange things : # grep . /proc/sys/net/core/?mem_* /proc/sys/net/core/rmem_default:212992 /proc/sys/net/core/rmem_max:131071 /proc/sys/net/core/wmem_default:212992 /proc/sys/net/core/wmem_max:131071 We can remove it completely. Signed-off-by: Eric Dumazet Reviewed-by: Shan Wei Signed-off-by: David S. Miller --- net/core/sock.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 727114cd6f7e..f5a426097236 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1464,19 +1464,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } EXPORT_SYMBOL_GPL(sk_setup_caps); -void __init sk_init(void) -{ - if (totalram_pages <= 4096) { - sysctl_wmem_max = 32767; - sysctl_rmem_max = 32767; - sysctl_wmem_default = 32767; - sysctl_rmem_default = 32767; - } else if (totalram_pages >= 131072) { - sysctl_wmem_max = 131071; - sysctl_rmem_max = 131071; - } -} - /* * Simple resource managers for sockets. */ -- cgit v1.2.3 From 69b08f62e17439ee3d436faf0b9a7ca6fffb78db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Sep 2012 06:46:57 +0000 Subject: net: use bigger pages in __netdev_alloc_frag We currently use percpu order-0 pages in __netdev_alloc_frag to deliver fragments used by __netdev_alloc_skb() Depending on NIC driver and arch being 32 or 64 bit, it allows a page to be split in several fragments (between 1 and 8), assuming PAGE_SIZE=4096 Switching to bigger pages (32768 bytes for PAGE_SIZE=4096 case) allows : - Better filling of space (the ending hole overhead is less an issue) - Less calls to page allocator or accesses to page->_count - Could allow struct skb_shared_info futures changes without major performance impact. This patch implements a transparent fallback to smaller pages in case of memory pressure. It also uses a standard "struct page_frag" instead of a custom one. Signed-off-by: Eric Dumazet Cc: Alexander Duyck Cc: Benjamin LaHaise Signed-off-by: David S. Miller --- net/core/skbuff.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2ede3cfa8ffa..607a70ff2cc2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -340,43 +340,57 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { - struct page *page; - unsigned int offset; - unsigned int pagecnt_bias; + struct page_frag frag; + /* we maintain a pagecount bias, so that we dont dirty cache line + * containing page->_count every time we allocate a fragment. + */ + unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) +#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) +#define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) +#define NETDEV_PAGECNT_MAX_BIAS NETDEV_FRAG_PAGE_MAX_SIZE static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct netdev_alloc_cache *nc; void *data = NULL; + int order; unsigned long flags; local_irq_save(flags); nc = &__get_cpu_var(netdev_alloc_cache); - if (unlikely(!nc->page)) { + if (unlikely(!nc->frag.page)) { refill: - nc->page = alloc_page(gfp_mask); - if (unlikely(!nc->page)) - goto end; + for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) { + gfp_t gfp = gfp_mask; + + if (order) + gfp |= __GFP_COMP | __GFP_NOWARN; + nc->frag.page = alloc_pages(gfp, order); + if (likely(nc->frag.page)) + break; + if (--order < 0) + goto end; + } + nc->frag.size = PAGE_SIZE << order; recycle: - atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS); - nc->pagecnt_bias = NETDEV_PAGECNT_BIAS; - nc->offset = 0; + atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS); + nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS; + nc->frag.offset = 0; } - if (nc->offset + fragsz > PAGE_SIZE) { + if (nc->frag.offset + fragsz > nc->frag.size) { /* avoid unnecessary locked operations if possible */ - if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) || - atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count)) + if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) || + atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count)) goto recycle; goto refill; } - data = page_address(nc->page) + nc->offset; - nc->offset += fragsz; + data = page_address(nc->frag.page) + nc->frag.offset; + nc->frag.offset += fragsz; nc->pagecnt_bias--; end: local_irq_restore(flags); -- cgit v1.2.3 From f4b549a5ac818722fc13d789584f41f4e00d78b5 Mon Sep 17 00:00:00 2001 From: Weiping Pan Date: Fri, 28 Sep 2012 20:15:30 +0000 Subject: use skb_end_offset() in skb_try_coalesce() Commit ec47ea824774(skb: Add inline helper for getting the skb end offset from head) introduces this helper function, skb_end_offset(), we should make use of it. Signed-off-by: Weiping Pan Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d607bae075d5..cdc28598f4ef 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3483,8 +3483,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) return false; - delta = from->truesize - - SKB_TRUESIZE(skb_end_pointer(from) - from->head); + delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); } WARN_ON_ONCE(delta < len); -- cgit v1.2.3 From c9e6bc644e557338221e75c242ab12c275a67d1b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2012 19:29:05 +0000 Subject: net: add gro_cells infrastructure This adds a new include file (include/net/gro_cells.h), to bring GRO (Generic Receive Offload) capability to tunnels, in a modular way. Because tunnels receive path is lockless, and GRO adds a serialization using a napi_struct, I chose to add an array of up to DEFAULT_MAX_NUM_RSS_QUEUES cells, so that multi queue devices wont be slowed down because of GRO layer. skb_get_rx_queue() is used as selector. In the future, we might add optional fanout capabilities, using rxhash for example. With help from Ben Hutchings who reminded me netif_get_num_default_rss_queues() function. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3e645f3751bf..ba4bb118a756 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2645,6 +2645,8 @@ EXPORT_SYMBOL(dev_queue_xmit); =======================================================================*/ int netdev_max_backlog __read_mostly = 1000; +EXPORT_SYMBOL(netdev_max_backlog); + int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; int weight_p __read_mostly = 64; /* old backlog weight */ -- cgit v1.2.3 From edc7d57327bd08bfd04f41531d49b176369db218 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 1 Oct 2012 12:32:33 +0000 Subject: netlink: add attributes to fdb interface Later changes need to be able to refer to neighbour attributes when doing fdb_add. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 92575370d9f0..76d4c2c3c89b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2090,7 +2090,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && (dev->priv_flags & IFF_BRIDGE_PORT)) { master = dev->master; - err = master->netdev_ops->ndo_fdb_add(ndm, dev, addr, + err = master->netdev_ops->ndo_fdb_add(ndm, tb, + dev, addr, nlh->nlmsg_flags); if (err) goto out; @@ -2100,7 +2101,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) { - err = dev->netdev_ops->ndo_fdb_add(ndm, dev, addr, + err = dev->netdev_ops->ndo_fdb_add(ndm, tb, + dev, addr, nlh->nlmsg_flags); if (!err) { -- cgit v1.2.3