diff options
Diffstat (limited to 'net')
142 files changed, 3519 insertions, 1534 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9649579b5b9f..71c3e045505b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -138,7 +138,7 @@ int vlan_check_real_dev(struct net_device *real_dev, return 0; } -int register_vlan_dev(struct net_device *dev) +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; @@ -174,7 +174,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; @@ -270,7 +270,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; - err = register_vlan_dev(new_dev); + err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index df8bd65dd370..94f8eed9f9b3 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -107,7 +107,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id); void vlan_setup(struct net_device *dev); -int register_vlan_dev(struct net_device *dev); +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e2ed69850489..0bc31de9071a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -21,6 +21,12 @@ bool vlan_do_receive(struct sk_buff **skbp) if (unlikely(!skb)) return false; + if (unlikely(!(vlan_dev->flags & IFF_UP))) { + kfree_skb(skb); + *skbp = NULL; + return false; + } + skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 5e831de3103e..6e7c5a6a7930 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -160,7 +160,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - return register_vlan_dev(dev); + return register_vlan_dev(dev, extack); } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e348f76ea8c1..f7b413b9297e 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -738,7 +738,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL, NULL); + soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 10f7edfb176e..e7d5fbb6ad53 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using skb_header_release in our skbs to allow skb_cow_header to + * data using __skb_header_release in our skbs to allow skb_cow_header to * work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer @@ -867,7 +867,8 @@ free_bat_counters: * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, - struct net_device *slave_dev) + struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; struct net *net = dev_net(dev); diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00..db82a40875e8 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. - source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af..65d734c165bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ done: release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6be41a44d688..a86e6687026e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, ETH_HLEN); if (is_direct_pkt_access) - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) __skb_push(skb, ETH_HLEN); @@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f6b6a92f1c48..7acb77c9bd65 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -320,12 +320,13 @@ void br_netpoll_disable(struct net_bridge_port *p) #endif -static int br_add_slave(struct net_device *dev, struct net_device *slave_dev) +static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); - return br_add_if(br, slave_dev); + return br_add_if(br, slave_dev, extack); } static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3aef22931ab..59a74a414e20 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -480,7 +480,8 @@ netdev_features_t br_features_recompute(struct net_bridge *br, } /* called with RTNL */ -int br_add_if(struct net_bridge *br, struct net_device *dev) +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; @@ -500,16 +501,22 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return -EINVAL; /* No bridging of bridges */ - if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) + if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { + NL_SET_ERR_MSG(extack, + "Can not enslave a bridge to a bridge"); return -ELOOP; + } /* Device is already being bridged */ if (br_port_exists(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ - if (dev->priv_flags & IFF_DONT_BRIDGE) + if (dev->priv_flags & IFF_DONT_BRIDGE) { + NL_SET_ERR_MSG(extack, + "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; + } p = new_nbp(br, dev); if (IS_ERR(p)) @@ -540,7 +547,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7637f58c1226..7cb613776b31 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -289,6 +289,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) * * Others reserved for future standardization */ + fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 7970f8540cbb..8f29103935a3 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -98,10 +98,13 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) return -EINVAL; if (isadd) - ret = br_add_if(br, dev); + ret = br_add_if(br, dev, NULL); else ret = br_del_if(br, dev); + if (!ret) + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + return ret; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3bc890716c89..dea88a255d26 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + 0; } @@ -208,7 +209,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & - BR_VLAN_TUNNEL))) + BR_VLAN_TUNNEL)) || + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -637,6 +639,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, }; /* Change the state of the port and notify spanning tree */ @@ -773,6 +776,15 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } #endif + + if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = fwd_mask; + } + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e870cfc85b14..ab4df24f7bba 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -36,7 +36,14 @@ /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ -#define BR_GROUPFWD_RESTRICTED 0x0007u +enum { + BR_GROUPFWD_STP = BIT(0), + BR_GROUPFWD_MACPAUSE = BIT(1), + BR_GROUPFWD_LACP = BIT(2), +}; + +#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \ + BR_GROUPFWD_LACP) /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u @@ -268,6 +275,7 @@ struct net_bridge_port { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + u16 group_fwd_mask; }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) @@ -558,7 +566,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, void br_port_carrier_check(struct net_bridge_port *p); int br_add_bridge(struct net *net, const char *name); int br_del_bridge(struct net *net, const char *name); -int br_add_if(struct net_bridge *br, struct net_device *dev); +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); int br_min_mtu(const struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 5d5d413a6cf8..9110d5e56085 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -165,6 +165,23 @@ static int store_flush(struct net_bridge_port *p, unsigned long v) } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); +static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%#x\n", p->group_fwd_mask); +} + +static int store_group_fwd_mask(struct net_bridge_port *p, + unsigned long v) +{ + if (v & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = v; + + return 0; +} +static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, + store_group_fwd_mask); + BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); @@ -223,6 +240,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, + &brport_attr_group_fwd_mask, NULL }; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f358d0bfa76b..79d14d70b7ea 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2445,19 +2445,34 @@ static void apply_upmap(struct ceph_osdmap *osdmap, pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { - for (i = 0; i < raw->size; i++) { - for (j = 0; j < pg->pg_upmap_items.len; j++) { - int from = pg->pg_upmap_items.from_to[j][0]; - int to = pg->pg_upmap_items.from_to[j][1]; - - if (from == raw->osds[i]) { - if (!(to != CRUSH_ITEM_NONE && - to < osdmap->max_osd && - osdmap->osd_weight[to] == 0)) - raw->osds[i] = to; + /* + * Note: this approach does not allow a bidirectional swap, + * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. + */ + for (i = 0; i < pg->pg_upmap_items.len; i++) { + int from = pg->pg_upmap_items.from_to[i][0]; + int to = pg->pg_upmap_items.from_to[i][1]; + int pos = -1; + bool exists = false; + + /* make sure replacement doesn't already appear */ + for (j = 0; j < raw->size; j++) { + int osd = raw->osds[j]; + + if (osd == to) { + exists = true; break; } + /* ignore mapping if target is marked out */ + if (osd == from && pos < 0 && + !(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) { + pos = j; + } } + if (!exists && pos >= 0) + raw->osds[pos] = to; } } } diff --git a/net/compat.c b/net/compat.c index 6ded6c821d7a..22381719718c 100644 --- a/net/compat.c +++ b/net/compat.c @@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } + /* + * check the length of messages copied in is the same as the + * what we get from the first loop + */ + if ((char *)kcmsg - (char *)kcmsg_base != kcmlen) + goto Einval; + /* Ok, looks like we made it. Hook it up and return success. */ kmsg->msg_control = kcmsg_base; kmsg->msg_controllen = kcmlen; diff --git a/net/core/datagram.c b/net/core/datagram.c index f7fb7e3f2acf..0b7b4c22719e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -188,7 +188,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, } if (!skb->len) { skb = skb_set_peeked(skb); - if (unlikely(IS_ERR(skb))) { + if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..fcddccb6be41 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> +#include <linux/net_namespace.h> #include "net-sysfs.h" @@ -162,7 +163,6 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info); static struct napi_struct *napi_by_id(unsigned int napi_id); @@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id); DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +static DEFINE_MUTEX(ifalias_mutex); + /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); @@ -1265,29 +1267,53 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; - memcpy(dev->ifalias, alias, len); - dev->ifalias[len] = 0; + mutex_lock(&ifalias_mutex); + rcu_swap_protected(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @name: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features @@ -1312,10 +1338,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; - change_info.flags_changed = 0; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } @@ -1536,9 +1563,10 @@ EXPORT_SYMBOL(dev_disable_lro); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } @@ -1663,11 +1691,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); */ static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info) { ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -1682,9 +1708,11 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_info(val, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -1948,8 +1976,12 @@ again: goto again; } out_unlock: - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); @@ -3860,8 +3892,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3872,8 +3904,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header. @@ -3881,6 +3930,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3892,15 +3942,18 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ + break; case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); break; - default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4690,6 +4743,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); @@ -6223,9 +6277,19 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv, void *upper_info) -{ - struct netdev_notifier_changeupper_info changeupper_info; + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = upper_info, + }; int ret = 0; ASSERT_RTNL(); @@ -6243,12 +6307,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - changeupper_info.upper_dev = upper_dev; - changeupper_info.master = master; - changeupper_info.linking = true; - changeupper_info.upper_info = upper_info; - - ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6259,7 +6318,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6284,9 +6343,11 @@ rollback: * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -6305,10 +6366,11 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info); + upper_priv, upper_info, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); @@ -6323,20 +6385,24 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; ASSERT_RTNL(); - changeupper_info.upper_dev = upper_dev; changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; - changeupper_info.linking = false; - call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -6352,11 +6418,13 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink); void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info info; + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); - call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); @@ -6482,11 +6550,13 @@ EXPORT_SYMBOL(dev_get_nest_level); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { - struct netdev_notifier_changelowerstate_info changelowerstate_info; + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; - call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); @@ -6777,11 +6847,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } @@ -7152,7 +7225,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL); + GFP_KERNEL, NULL); /* * Flush the unicast and multicast chains @@ -7989,7 +8062,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; - size_t alloc_size; + unsigned int alloc_size; struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -8239,7 +8312,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err; + int err, new_nsid; ASSERT_RTNL(); @@ -8295,7 +8368,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) + new_nsid = peernet2id_alloc(dev_net(dev), net); + else + new_nsid = peernet2id(dev_net(dev), net); + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); /* * Flush the unicast and multicast chains diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6a582ae4c5d9..3228411ada0f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings( = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; return retval; } diff --git a/net/core/filter.c b/net/core/filter.c index 24dd33dd9f04..b7e8caa1e790 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -989,10 +989,14 @@ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - bool ret = __sk_filter_charge(sk, fp); - if (ret) - refcount_inc(&fp->refcnt); - return ret; + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) @@ -1402,7 +1406,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return err; } @@ -1794,7 +1798,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - const struct bpf_prog *map_owner; + unsigned long map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1962,7 +1966,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -1984,7 +1988,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2178,7 +2182,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2303,7 +2307,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2394,7 +2398,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, skb_gso_reset(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2434,7 +2438,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return 0; } @@ -2447,14 +2451,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2468,6 +2484,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2500,11 +2543,17 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, + unsigned long aux) +{ + return (unsigned long)xdp_prog->aux != aux; +} + static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2512,9 +2561,9 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2574,7 +2623,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; @@ -2583,10 +2632,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; if (map) { - if (unlikely(map_owner != xdp_prog)) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { err = -EFAULT; map = NULL; goto err; @@ -2632,7 +2681,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; return XDP_REDIRECT; } @@ -2646,7 +2695,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { }; BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - const struct bpf_prog *, map_owner) + unsigned long, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -2686,7 +2735,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -3282,6 +3332,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3412,6 +3464,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3438,6 +3491,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3462,6 +3516,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3580,6 +3635,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3613,6 +3671,9 @@ static bool xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3671,6 +3732,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -3683,8 +3750,6 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3841,6 +3906,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4089,6 +4163,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0a977373d003..1f5caafb4492 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,7 @@ #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <net/dsa.h> +#include <net/dst_metadata.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/gre.h> @@ -115,6 +116,102 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static void +skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *ctrl; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) + return; + + ctrl = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + target_container); + ctrl->addr_type = type; +} + +static void +__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct ip_tunnel_info *info; + struct ip_tunnel_key *key; + + /* A quick check to see if there might be something to do. */ + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS)) + return; + + info = skb_tunnel_info(skb); + if (!info) + return; + + key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *ipv4; + + ipv4 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target_container); + ipv4->src = key->u.ipv4.src; + ipv4->dst = key->u.ipv4.dst; + } + break; + case AF_INET6: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *ipv6; + + ipv6 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + target_container); + ipv6->src = key->u.ipv6.src; + ipv6->dst = key->u.ipv6.dst; + } + break; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *keyid; + + keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target_container); + keyid->keyid = tunnel_id_to_key32(key->tun_id); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *tp; + + tp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + target_container); + tp->src = key->tp_src; + tp->dst = key->tp_dst; + } +} + static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -478,6 +575,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + __skb_flow_dissect_tunnel_info(skb, flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1307731ddfe4..e7e626fb87bb 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, */ preempt_disable(); rcu_read_lock(); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); rcu_read_unlock(); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 16a1a4c4eb57..6ea3a1a7f36a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -457,7 +457,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey) { struct neighbour *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val; struct neigh_hash_table *nht; @@ -488,7 +488,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { u32 hash_val; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; @@ -572,7 +572,7 @@ out_neigh_release: } EXPORT_SYMBOL(__neigh_create); -static u32 pneigh_hash(const void *pkey, int key_len) +static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); @@ -585,7 +585,7 @@ static u32 pneigh_hash(const void *pkey, int key_len) static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct net *net, const void *pkey, - int key_len, + unsigned int key_len, struct net_device *dev) { while (n) { @@ -601,7 +601,7 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); return __pneigh_lookup_1(tbl->phash_buckets[hash_val], @@ -614,7 +614,7 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, struct net_device *dev, int creat) { struct pneigh_entry *n; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); @@ -659,7 +659,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; - int key_len = tbl->key_len; + unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); @@ -1662,7 +1662,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(dst_attr) < tbl->key_len) + if (nla_len(dst_attr) < (int)tbl->key_len) goto out; if (ndm->ndm_flags & NTF_PROXY) { @@ -1730,7 +1730,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tbl == NULL) return -EAFNOSUPPORT; - if (nla_len(tb[NDA_DST]) < tbl->key_len) + if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 927a6dcbad96..51d5836d8fb9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); ret = dev_set_alias(netdev, buf, count); - rtnl_unlock(); return ret < 0 ? ret : len; } @@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); + char tmp[IFALIASZ]; ssize_t ret = 0; - if (!rtnl_trylock()) - return restart_syscall(); - if (netdev->ifalias) - ret = sprintf(buf, "%s\n", netdev->ifalias); - rtnl_unlock(); + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sprintf(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); - kfree(dev->ifalias); + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); netdev_freemem(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..e84d108cfee4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -476,25 +476,13 @@ void rtnl_af_register(struct rtnl_af_ops *ops) EXPORT_SYMBOL_GPL(rtnl_af_register); /** - * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. - * @ops: struct rtnl_af_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_af_unregister(struct rtnl_af_ops *ops) -{ - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_af_unregister); - -/** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - __rtnl_af_unregister(ops); + list_del(&ops->list); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -522,11 +510,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev, static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; + bool ret = false; - master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) - return true; - return false; + ret = true; + rcu_read_unlock(); + return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, @@ -923,6 +915,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1211,6 +1204,36 @@ nla_put_vfinfo_failure: return -EMSGSIZE; } +static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, + struct net_device *dev, + u32 ext_filter_mask) +{ + struct nlattr *vfinfo; + int i, num_vfs; + + if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) + return 0; + + num_vfs = dev_num_vf(dev->dev.parent); + if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) + return -EMSGSIZE; + + if (!dev->netdev_ops->ndo_get_vf_config) + return 0; + + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + return -EMSGSIZE; + + for (i = 0; i < num_vfs; i++) { + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) + return -EMSGSIZE; + } + + nla_nest_end(skb, vfinfo); + return 0; +} + static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_ifmap map; @@ -1307,16 +1330,67 @@ static u32 rtnl_get_event(unsigned long event) return rtnl_event_type; } +static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device *upper_dev; + int ret = 0; + + rcu_read_lock(); + + upper_dev = netdev_master_upper_dev_get_rcu(dev); + if (upper_dev) + ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + + rcu_read_unlock(); + return ret; +} + +static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) +{ + int ifindex = dev_get_iflink(dev); + + if (dev->ifindex == ifindex) + return 0; + + return nla_put_u32(skb, IFLA_LINK, ifindex); +} + +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + +static int rtnl_fill_link_netnsid(struct sk_buff *skb, + const struct net_device *dev) +{ + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { + struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); + + if (!net_eq(dev_net(dev), link_net)) { + int id = peernet2id_alloc(dev_net(dev), link_net); + + if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) + return -EMSGSIZE; + } + } + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event) + u32 event, int *new_nsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct nlattr *af_spec; struct rtnl_af_ops *af_ops; - struct net_device *upper_dev = netdev_master_upper_dev_get(dev); ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1345,15 +1419,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || - (upper_dev && - nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || + nla_put_iflink(skb, dev) || + put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || - (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_changes)) || nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) @@ -1385,27 +1456,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && - nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) + if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && - ext_filter_mask & RTEXT_FILTER_VF) { - int i; - struct nlattr *vfinfo; - int num_vfs = dev_num_vf(dev->dev.parent); - - vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); - if (!vfinfo) - goto nla_put_failure; - for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) - goto nla_put_failure; - } - - nla_nest_end(skb, vfinfo); - } - if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; @@ -1417,17 +1470,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } - if (dev->rtnl_link_ops && - dev->rtnl_link_ops->get_link_net) { - struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); - - if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(dev_net(dev), link_net); + if (rtnl_fill_link_netnsid(skb, dev)) + goto nla_put_failure; - if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) - goto nla_put_failure; - } - } + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) goto nla_put_failure; @@ -1658,7 +1706,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0); + ext_filter_mask, 0, NULL); if (err < 0) { if (likely(skb->len)) @@ -1909,7 +1957,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } -static int do_set_master(struct net_device *dev, int ifindex) +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; @@ -1934,7 +1983,7 @@ static int do_set_master(struct net_device *dev, int ifindex) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(upper_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { @@ -2067,7 +2116,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2576,12 +2625,6 @@ replay: return err; slave_data = slave_attr; } - if (m_ops->slave_validate) { - err = m_ops->slave_validate(tb, slave_data, - extack); - if (err < 0) - return err; - } } if (dev) { @@ -2711,7 +2754,8 @@ replay: goto out_unregister; } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), + extack); if (err) goto out_unregister; } @@ -2771,7 +2815,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2856,7 +2900,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags) + u32 event, gfp_t flags, int *new_nsid) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2867,7 +2911,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event, + new_nsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2890,14 +2935,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags) + gfp_t flags, int *new_nsid) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -2905,10 +2950,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); } EXPORT_SYMBOL(rtmsg_ifinfo); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + new_nsid); +} + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -3854,6 +3906,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; @@ -4284,7 +4339,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL); + GFP_KERNEL, NULL); break; default: break; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16982de649b9..822a90e56aea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). @@ -2848,12 +2850,15 @@ EXPORT_SYMBOL(skb_queue_purge); */ void skb_rbtree_purge(struct rb_root *root) { - struct sk_buff *skb, *next; + struct rb_node *p = rb_first(root); - rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) - kfree_skb(skb); + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); - *root = RB_ROOT; + p = rb_next(p); + rb_erase(&skb->rbnode, root); + kfree_skb(skb); + } } /** @@ -4762,6 +4767,7 @@ EXPORT_SYMBOL(kfree_skb_partial); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { + struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; @@ -4776,7 +4782,9 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return true; } - if (skb_has_frag_list(to) || skb_has_frag_list(from)) + to_shinfo = skb_shinfo(to); + from_shinfo = skb_shinfo(from); + if (to_shinfo->frag_list || from_shinfo->frag_list) return false; if (skb_zcopy(to) || skb_zcopy(from)) return false; @@ -4785,8 +4793,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, struct page *page; unsigned int offset; - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) @@ -4797,12 +4805,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, + skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); @@ -4810,19 +4818,19 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, WARN_ON_ONCE(delta < len); - memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, - skb_shinfo(from)->frags, - skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); - skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + memcpy(to_shinfo->frags + to_shinfo->nr_frags, + from_shinfo->frags, + from_shinfo->nr_frags * sizeof(skb_frag_t)); + to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) - skb_shinfo(from)->nr_frags = 0; + from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. */ - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) - skb_frag_ref(from, i); + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); to->truesize += delta; to->len += len; diff --git a/net/core/sock.c b/net/core/sock.c index 9b7b6bbb2a23..23953b741a41 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); @@ -1682,13 +1684,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new diff --git a/net/dsa/Makefile b/net/dsa/Makefile index fcce25da937c..2e7ac8bab19d 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o legacy.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o legacy.o master.o port.o slave.o switch.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 03c58b0eb082..51ca2a524a27 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -112,34 +112,6 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) return ops; } -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp) -{ - struct dsa_switch *ds = cpu_dp->ds; - struct net_device *master; - struct ethtool_ops *cpu_ops; - - master = cpu_dp->netdev; - - cpu_ops = devm_kzalloc(ds->dev, sizeof(*cpu_ops), GFP_KERNEL); - if (!cpu_ops) - return -ENOMEM; - - memcpy(&cpu_dp->ethtool_ops, master->ethtool_ops, - sizeof(struct ethtool_ops)); - cpu_dp->orig_ethtool_ops = master->ethtool_ops; - memcpy(cpu_ops, &cpu_dp->ethtool_ops, - sizeof(struct ethtool_ops)); - dsa_cpu_port_ethtool_init(cpu_ops); - master->ethtool_ops = cpu_ops; - - return 0; -} - -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp) -{ - cpu_dp->netdev->ethtool_ops = cpu_dp->orig_ethtool_ops; -} - void dsa_cpu_dsa_destroy(struct dsa_port *port) { struct device_node *port_dn = port->dn; @@ -188,12 +160,12 @@ EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; - if (unlikely(dst == NULL)) { + if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } @@ -202,7 +174,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = dst->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev, pt); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 873af0108e24..54ed054777bd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -433,18 +433,17 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) return err; } - if (dst->cpu_dp) { - err = dsa_cpu_port_ethtool_setup(dst->cpu_dp); - if (err) - return err; - } - /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. */ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst; + dst->cpu_dp->netdev->dsa_ptr = dst->cpu_dp; + + err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); + if (err) + return err; + dst->applied = true; return 0; @@ -458,6 +457,8 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; + dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -474,10 +475,7 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) dsa_ds_unapply(dst, ds); } - if (dst->cpu_dp) { - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dst->cpu_dp = NULL; - } + dst->cpu_dp = NULL; pr_info("DSA: tree %d unapplied\n", dst->tree); dst->applied = false; @@ -487,6 +485,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; struct net_device *ethernet_dev; struct device_node *ethernet; @@ -516,14 +515,18 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, ds->cpu_port_mask |= BIT(index); tag_protocol = ds->ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) { + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); ds->cpu_port_mask &= ~BIT(index); - return PTR_ERR(dst->tag_ops); + return PTR_ERR(tag_ops); } - dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->tag_ops = tag_ops; + + /* Make a few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; + dst->cpu_dp->dst = dst; return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9c3eeb72462d..2850077cc9cc 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -66,7 +66,7 @@ struct dsa_notifier_vlan_info { }; struct dsa_slave_priv { - /* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */ + /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); @@ -79,7 +79,6 @@ struct dsa_slave_priv { * The phylib phy_device pointer for the PHY connected * to this port. */ - struct phy_device *phy; phy_interface_t phy_interface; int old_link; int old_pause; @@ -97,8 +96,6 @@ struct dsa_slave_priv { int dsa_cpu_dsa_setup(struct dsa_port *port); void dsa_cpu_dsa_destroy(struct dsa_port *dport); const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); -int dsa_cpu_port_ethtool_setup(struct dsa_port *cpu_dp); -void dsa_cpu_port_ethtool_restore(struct dsa_port *cpu_dp); bool dsa_schedule_work(struct work_struct *work); /* legacy.c */ @@ -112,10 +109,35 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); +/* master.c */ +int dsa_master_ethtool_setup(struct net_device *dev); +void dsa_master_ethtool_restore(struct net_device *dev); + +static inline struct net_device *dsa_master_get_slave(struct net_device *dev, + int device, int port) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct dsa_switch *ds; + + if (device < 0 || device >= DSA_MAX_SWITCHES) + return NULL; + + ds = dst->ds[device]; + if (!ds) + return NULL; + + if (port < 0 || port >= ds->num_ports) + return NULL; + + return ds->ports[port].netdev; +} + /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); -void dsa_port_set_state_now(struct dsa_port *dp, u8 state); +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); +void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, @@ -126,6 +148,7 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); +int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); @@ -139,7 +162,6 @@ int dsa_port_vlan_del(struct dsa_port *dp, /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); -void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops); int dsa_slave_create(struct dsa_port *port, const char *name); void dsa_slave_destroy(struct net_device *slave_dev); int dsa_slave_suspend(struct net_device *slave_dev); @@ -180,9 +202,4 @@ static inline struct net_device *dsa_master_netdev(struct dsa_slave_priv *p) return p->dp->cpu_dp->netdev; } -static inline struct dsa_port *dsa_get_cpu_port(struct dsa_switch_tree *dst) -{ - return dst->cpu_dp; -} - #endif diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 91e6f7981d39..19ff6e0a21dc 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -144,14 +144,19 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, * switch. */ if (dst->cpu_dp->ds == ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; tag_protocol = ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) - return PTR_ERR(dst->tag_ops); + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) + return PTR_ERR(tag_ops); - dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->tag_ops = tag_ops; + + /* Few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; + dst->cpu_dp->dst = dst; } memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); @@ -206,10 +211,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, netdev_err(master, "[%d] : can't configure CPU and DSA ports\n", index); - ret = dsa_cpu_port_ethtool_setup(ds->dst->cpu_dp); - if (ret) - return ret; - return 0; } @@ -604,9 +605,9 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, * sent to the tag format's receive function. */ wmb(); - dev->dsa_ptr = dst; + dev->dsa_ptr = dst->cpu_dp; - return 0; + return dsa_master_ethtool_setup(dst->cpu_dp->netdev); } static int dsa_probe(struct platform_device *pdev) @@ -671,6 +672,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; + dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dst->cpu_dp->netdev->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype @@ -686,8 +689,6 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dsa_cpu_port_ethtool_restore(dst->cpu_dp); - dev_put(dst->cpu_dp->netdev); } diff --git a/net/dsa/master.c b/net/dsa/master.c new file mode 100644 index 000000000000..5f3f57e372e0 --- /dev/null +++ b/net/dsa/master.c @@ -0,0 +1,117 @@ +/* + * Handling of a master device, switching frames via its switch fabric CPU port + * + * Copyright (c) 2017 Savoir-faire Linux Inc. + * Vivien Didelot <vivien.didelot@savoirfairelinux.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include "dsa_priv.h" + +static void dsa_master_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + uint64_t *data) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; + int count = 0; + + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + count = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_ethtool_stats(dev, stats, data); + } + + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, port, data + count); +} + +static int dsa_master_get_sset_count(struct net_device *dev, int sset) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int count = 0; + + if (ops && ops->get_sset_count) + count += ops->get_sset_count(dev, sset); + + if (sset == ETH_SS_STATS && ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds); + + return count; +} + +static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, + uint8_t *data) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; + int len = ETH_GSTRING_LEN; + int mcount = 0, count; + unsigned int i; + uint8_t pfx[4]; + uint8_t *ndata; + + snprintf(pfx, sizeof(pfx), "p%.2d", port); + /* We do not want to be NULL-terminated, since this is a prefix */ + pfx[sizeof(pfx) - 1] = '_'; + + if (ops && ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, ETH_SS_STATS); + ops->get_strings(dev, stringset, data); + } + + if (stringset == ETH_SS_STATS && ds->ops->get_strings) { + ndata = data + mcount * len; + /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle + * the output after to prepend our CPU port prefix we + * constructed earlier + */ + ds->ops->get_strings(ds, port, ndata); + count = ds->ops->get_sset_count(ds); + for (i = 0; i < count; i++) { + memmove(ndata + (i * len + sizeof(pfx)), + ndata + i * len, len - sizeof(pfx)); + memcpy(ndata + i * len, pfx, sizeof(pfx)); + } + } +} + +int dsa_master_ethtool_setup(struct net_device *dev) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch *ds = cpu_dp->ds; + struct ethtool_ops *ops; + + ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + cpu_dp->orig_ethtool_ops = dev->ethtool_ops; + if (cpu_dp->orig_ethtool_ops) + memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops)); + + ops->get_sset_count = dsa_master_get_sset_count; + ops->get_ethtool_stats = dsa_master_get_ethtool_stats; + ops->get_strings = dsa_master_get_strings; + + dev->ethtool_ops = ops; + + return 0; +} + +void dsa_master_ethtool_restore(struct net_device *dev) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + + dev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; +} diff --git a/net/dsa/port.c b/net/dsa/port.c index 659676ba3f8b..72c8dbd3d3f2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -56,7 +56,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state, return 0; } -void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) { int err; @@ -65,6 +65,35 @@ void dsa_port_set_state_now(struct dsa_port *dp, u8 state) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) +{ + u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; + struct dsa_switch *ds = dp->ds; + int port = dp->index; + int err; + + if (ds->ops->port_enable) { + err = ds->ops->port_enable(ds, port, phy); + if (err) + return err; + } + + dsa_port_set_state_now(dp, stp_state); + + return 0; +} + +void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + dsa_port_set_state_now(dp, BR_STATE_DISABLED); + + if (ds->ops->port_disable) + ds->ops->port_disable(ds, port, phy); +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { @@ -173,6 +202,17 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } +int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index; + + if (!ds->ops->port_fdb_dump) + return -EOPNOTSUPP; + + return ds->ops->port_fdb_dump(ds, port, cb, data); +} + int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..fb2954ff198c 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -73,9 +73,7 @@ static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; struct net_device *master = dsa_master_netdev(p); - u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; int err; if (!(master->flags & IFF_UP)) @@ -98,16 +96,12 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - if (ds->ops->port_enable) { - err = ds->ops->port_enable(ds, p->dp->index, p->phy); - if (err) - goto clear_promisc; - } - - dsa_port_set_state_now(p->dp, stp_state); + err = dsa_port_enable(dp, dev->phydev); + if (err) + goto clear_promisc; - if (p->phy) - phy_start(p->phy); + if (dev->phydev) + phy_start(dev->phydev); return 0; @@ -128,10 +122,12 @@ static int dsa_slave_close(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct net_device *master = dsa_master_netdev(p); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = p->dp; + + if (dev->phydev) + phy_stop(dev->phydev); - if (p->phy) - phy_stop(p->phy); + dsa_port_disable(dp, dev->phydev); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); @@ -143,11 +139,6 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); - if (ds->ops->port_disable) - ds->ops->port_disable(ds, p->dp->index, p->phy); - - dsa_port_set_state_now(p->dp, BR_STATE_DISABLED); - return 0; } @@ -263,27 +254,20 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, }; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_port *dp = p->dp; - struct dsa_switch *ds = dp->ds; int err; - if (!ds->ops->port_fdb_dump) - return -EOPNOTSUPP; - - err = ds->ops->port_fdb_dump(ds, dp->index, - dsa_slave_port_fdb_do_dump, - &dump); + err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump); *idx = dump.idx; + return err; } static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (p->phy != NULL) - return phy_mii_ioctl(p->phy, ifr, cmd); + if (!dev->phydev) + return -ENODEV; - return -EOPNOTSUPP; + return phy_mii_ioctl(dev->phydev, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, @@ -385,10 +369,12 @@ static int dsa_slave_port_attr_get(struct net_device *dev, return 0; } -static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p, - struct sk_buff *skb) +static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, + struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER + struct dsa_slave_priv *p = netdev_priv(dev); + if (p->netpoll) netpoll_send_skb(p->netpoll, skb); #else @@ -422,7 +408,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) - return dsa_netpoll_send_skb(p, nskb); + return dsa_slave_netpoll_send_skb(dev, nskb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType @@ -434,31 +420,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) } /* ethtool operations *******************************************************/ -static int -dsa_slave_get_link_ksettings(struct net_device *dev, - struct ethtool_link_ksettings *cmd) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) - return -EOPNOTSUPP; - - phy_ethtool_ksettings_get(p->phy, cmd); - - return 0; -} - -static int -dsa_slave_set_link_ksettings(struct net_device *dev, - const struct ethtool_link_ksettings *cmd) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - if (p->phy != NULL) - return phy_ethtool_ksettings_set(p->phy, cmd); - - return -EOPNOTSUPP; -} static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) @@ -489,26 +450,14 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) ds->ops->get_regs(ds, p->dp->index, regs, _p); } -static int dsa_slave_nway_reset(struct net_device *dev) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - if (p->phy != NULL) - return genphy_restart_aneg(p->phy); - - return -EOPNOTSUPP; -} - static u32 dsa_slave_get_link(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + if (!dev->phydev) + return -ENODEV; - if (p->phy != NULL) { - genphy_update_link(p->phy); - return p->phy->link; - } + genphy_update_link(dev->phydev); - return -EOPNOTSUPP; + return dev->phydev->link; } static int dsa_slave_get_eeprom_len(struct net_device *dev) @@ -567,79 +516,6 @@ static void dsa_slave_get_strings(struct net_device *dev, } } -static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, - uint64_t *data) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - s8 cpu_port = cpu_dp->index; - int count = 0; - - if (cpu_dp->ethtool_ops.get_sset_count) { - count = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_ethtool_stats(dev, stats, data); - } - - if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, cpu_port, data + count); -} - -static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - int count = 0; - - if (cpu_dp->ethtool_ops.get_sset_count) - count += cpu_dp->ethtool_ops.get_sset_count(dev, sset); - - if (sset == ETH_SS_STATS && ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds); - - return count; -} - -static void dsa_cpu_port_get_strings(struct net_device *dev, - uint32_t stringset, uint8_t *data) -{ - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; - s8 cpu_port = cpu_dp->index; - int len = ETH_GSTRING_LEN; - int mcount = 0, count; - unsigned int i; - uint8_t pfx[4]; - uint8_t *ndata; - - snprintf(pfx, sizeof(pfx), "p%.2d", cpu_port); - /* We do not want to be NULL-terminated, since this is a prefix */ - pfx[sizeof(pfx) - 1] = '_'; - - if (cpu_dp->ethtool_ops.get_sset_count) { - mcount = cpu_dp->ethtool_ops.get_sset_count(dev, ETH_SS_STATS); - cpu_dp->ethtool_ops.get_strings(dev, stringset, data); - } - - if (stringset == ETH_SS_STATS && ds->ops->get_strings) { - ndata = data + mcount * len; - /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle - * the output after to prepend our CPU port prefix we - * constructed earlier - */ - ds->ops->get_strings(ds, cpu_port, ndata); - count = ds->ops->get_sset_count(ds); - for (i = 0; i < count; i++) { - memmove(ndata + (i * len + sizeof(pfx)), - ndata + i * len, len - sizeof(pfx)); - memcpy(ndata + i * len, pfx, sizeof(pfx)); - } - } -} - static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) @@ -716,7 +592,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) @@ -727,12 +603,12 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) return ret; if (e->eee_enabled) { - ret = phy_init_eee(p->phy, 0); + ret = phy_init_eee(dev->phydev, 0); if (ret) return ret; } - return phy_ethtool_set_eee(p->phy, e); + return phy_ethtool_set_eee(dev->phydev, e); } static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) @@ -742,7 +618,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->get_mac_eee) @@ -752,7 +628,7 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) if (ret) return ret; - return phy_ethtool_get_eee(p->phy, e); + return phy_ethtool_get_eee(dev->phydev, e); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -809,9 +685,9 @@ static int dsa_slave_get_phys_port_name(struct net_device *dev, } static struct dsa_mall_tc_entry * -dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p, - unsigned long cookie) +dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { + struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) @@ -893,7 +769,7 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, if (!ds->ops->port_mirror_del) return; - mall_tc_entry = dsa_slave_mall_tc_entry_find(p, cls->cookie); + mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; @@ -976,13 +852,6 @@ static void dsa_slave_get_stats64(struct net_device *dev, } } -void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) -{ - ops->get_sset_count = dsa_cpu_port_get_sset_count; - ops->get_ethtool_stats = dsa_cpu_port_get_ethtool_stats; - ops->get_strings = dsa_cpu_port_get_strings; -} - static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { @@ -1011,7 +880,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, .get_regs = dsa_slave_get_regs, - .nway_reset = dsa_slave_nway_reset, + .nway_reset = phy_ethtool_nway_reset, .get_link = dsa_slave_get_link, .get_eeprom_len = dsa_slave_get_eeprom_len, .get_eeprom = dsa_slave_get_eeprom, @@ -1023,8 +892,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, - .get_link_ksettings = dsa_slave_get_link_ksettings, - .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_link_ksettings = phy_ethtool_get_link_ksettings, + .set_link_ksettings = phy_ethtool_set_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, }; @@ -1068,26 +937,26 @@ static void dsa_slave_adjust_link(struct net_device *dev) struct dsa_switch *ds = p->dp->ds; unsigned int status_changed = 0; - if (p->old_link != p->phy->link) { + if (p->old_link != dev->phydev->link) { status_changed = 1; - p->old_link = p->phy->link; + p->old_link = dev->phydev->link; } - if (p->old_duplex != p->phy->duplex) { + if (p->old_duplex != dev->phydev->duplex) { status_changed = 1; - p->old_duplex = p->phy->duplex; + p->old_duplex = dev->phydev->duplex; } - if (p->old_pause != p->phy->pause) { + if (p->old_pause != dev->phydev->pause) { status_changed = 1; - p->old_pause = p->phy->pause; + p->old_pause = dev->phydev->pause; } if (ds->ops->adjust_link && status_changed) - ds->ops->adjust_link(ds, p->dp->index, p->phy); + ds->ops->adjust_link(ds, p->dp->index, dev->phydev); if (status_changed) - phy_print_status(p->phy); + phy_print_status(dev->phydev); } static int dsa_slave_fixed_link_update(struct net_device *dev, @@ -1107,28 +976,28 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, } /* slave device setup *******************************************************/ -static int dsa_slave_phy_connect(struct dsa_slave_priv *p, - struct net_device *slave_dev, - int addr) +static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) { + struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; - p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); - if (!p->phy) { + slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); + if (!slave_dev->phydev) { netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; } /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) - p->phy_interface = p->phy->interface; - return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, - p->phy_interface); + p->phy_interface = slave_dev->phydev->interface; + + return phy_connect_direct(slave_dev, slave_dev->phydev, + dsa_slave_adjust_link, p->phy_interface); } -static int dsa_slave_phy_setup(struct dsa_slave_priv *p, - struct net_device *slave_dev) +static int dsa_slave_phy_setup(struct net_device *slave_dev) { + struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = p->dp->ds; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; @@ -1168,30 +1037,31 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, */ if (!phy_is_fixed && phy_id >= 0 && (ds->phys_mii_mask & (1 << phy_id))) { - ret = dsa_slave_phy_connect(p, slave_dev, phy_id); + ret = dsa_slave_phy_connect(slave_dev, phy_id); if (ret) { netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); of_node_put(phy_dn); return ret; } } else { - p->phy = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, - phy_flags, - p->phy_interface); + slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, + phy_flags, + p->phy_interface); } of_node_put(phy_dn); } - if (p->phy && phy_is_fixed) - fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); + if (slave_dev->phydev && phy_is_fixed) + fixed_phy_set_link_update(slave_dev->phydev, + dsa_slave_fixed_link_update); /* We could not connect to a designated PHY, so use the switch internal * MDIO bus instead */ - if (!p->phy) { - ret = dsa_slave_phy_connect(p, slave_dev, p->dp->index); + if (!slave_dev->phydev) { + ret = dsa_slave_phy_connect(slave_dev, p->dp->index); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->dp->index, ret); @@ -1201,7 +1071,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, } } - phy_attached_info(p->phy); + phy_attached_info(slave_dev->phydev); return 0; } @@ -1221,12 +1091,12 @@ int dsa_slave_suspend(struct net_device *slave_dev) netif_device_detach(slave_dev); - if (p->phy) { - phy_stop(p->phy); + if (slave_dev->phydev) { + phy_stop(slave_dev->phydev); p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; - phy_suspend(p->phy); + phy_suspend(slave_dev->phydev); } return 0; @@ -1234,13 +1104,11 @@ int dsa_slave_suspend(struct net_device *slave_dev) int dsa_slave_resume(struct net_device *slave_dev) { - struct dsa_slave_priv *p = netdev_priv(slave_dev); - netif_device_attach(slave_dev); - if (p->phy) { - phy_resume(p->phy); - phy_start(p->phy); + if (slave_dev->phydev) { + phy_resume(slave_dev->phydev); + phy_start(slave_dev->phydev); } return 0; @@ -1249,7 +1117,6 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { struct dsa_switch *ds = port->ds; - struct dsa_switch_tree *dst = ds->dst; struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; @@ -1294,35 +1161,40 @@ int dsa_slave_create(struct dsa_port *port, const char *name) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - p->xmit = dst->tag_ops->xmit; + p->xmit = cpu_dp->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; port->netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - port->netdev = NULL; - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; - } netif_carrier_off(slave_dev); - ret = dsa_slave_phy_setup(p, slave_dev); + ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(slave_dev->phydev); + if (of_phy_is_fixed_link(p->dp->dn)) + of_phy_deregister_fixed_link(p->dp->dn); +out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); + port->netdev = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) @@ -1333,8 +1205,8 @@ void dsa_slave_destroy(struct net_device *slave_dev) port_dn = p->dp->dn; netif_carrier_off(slave_dev); - if (p->phy) { - phy_disconnect(p->phy); + if (slave_dev->phydev) { + phy_disconnect(slave_dev->phydev); if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); @@ -1374,7 +1246,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (dev->netdev_ops != &dsa_slave_netdev_ops) + if (!dsa_slave_dev_check(dev)) return NOTIFY_DONE; if (event == NETDEV_CHANGEUPPER) diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index dbb016434ace..8e4bdb9d9ae3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -92,9 +92,6 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; int source_port; u8 *brcm_tag; @@ -117,8 +114,8 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Locate which port this is coming from */ source_port = brcm_tag[3] & BRCM_EG_PID_MASK; - /* Validate port against switch setup, either the port is totally */ - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; /* Remove Broadcom tag and update checksum */ @@ -129,8 +126,6 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index fbf9ca954773..c77218f173d1 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -67,8 +67,6 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *dsa_header; int source_device; int source_port; @@ -93,18 +91,8 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. - */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -153,8 +141,6 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 76367ba1b2e2..0b83cbe0c9e8 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -80,8 +80,6 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *edsa_header; int source_device; int source_port; @@ -106,18 +104,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = edsa_header[0] & 0x1f; source_port = (edsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. - */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_get_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -172,8 +160,6 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 010ca0a336c4..b241c990cde0 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -80,22 +80,19 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *tag; int source_port; tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; source_port = tag[0] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 0b9826105e42..4f211e56c81f 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -71,17 +71,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u16 *lan9303_tag; - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; unsigned int source_port; - ds = dst->ds[0]; - - if (unlikely(!ds)) { - dev_warn_ratelimited(&dev->dev, "Dropping packet, due to missing DSA switch device\n"); - return NULL; - } - if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull\n"); @@ -103,16 +94,12 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, source_port = ntohs(lan9303_tag[1]) & 0x3; - if (source_port >= ds->num_ports) { + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); return NULL; } - if (!ds->ports[source_port].netdev) { - dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid netdev or device\n"); - return NULL; - } - /* remove the special VLAN tag between the MAC addresses * and the current ethertype field. */ @@ -120,9 +107,6 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); - /* forward the packet to the dedicated interface */ - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index ec8ee5f43255..968586c5d40e 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -46,8 +46,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; int port; __be16 *phdr, hdr; @@ -68,20 +66,12 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - MTK_HDR_LEN, 2 * ETH_ALEN); - /* This protocol doesn't support cascading multiple - * switches so it's safe to assume the switch is first - * in the tree. - */ - ds = dst->ds[0]; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_get_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1d4c70711c0f..8d33d9ebf910 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -65,9 +65,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds; u8 ver; int port; __be16 *phdr, hdr; @@ -92,20 +89,12 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, ETH_HLEN - QCA_HDR_LEN); - /* This protocol doesn't support cascading multiple switches so it's - * safe to assume the switch is first in the tree - */ - ds = cpu_dp->ds; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - /* Update skb & forward the frame accordingly */ - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_get_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d2fd4923aa3e..61668be267f5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -58,9 +58,6 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *trailer; int source_port; @@ -73,13 +70,13 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; source_port = trailer[1] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_get_slave(dev, 0, source_port); + if (!skb->dev) return NULL; pskb_trim_rcsum(skb, skb->len - 4); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..43a1bbed7a42 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d7adc0616599..7ce22a2c07ce 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -137,22 +137,12 @@ static void inet_hash_remove(struct in_ifaddr *ifa) */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - u32 hash = inet_addr_hash(net, addr); struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) { - if (ifa->ifa_local == addr) { - struct net_device *dev = ifa->ifa_dev->dev; - - if (!net_eq(dev_net(dev), net)) - continue; - result = dev; - break; - } - } - if (!result) { + ifa = inet_lookup_ifaddr_rcu(net, addr); + if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; @@ -165,6 +155,8 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); + } else { + result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); @@ -173,6 +165,20 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) } EXPORT_SYMBOL(__ip_dev_find); +/* called under RCU lock */ +struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) +{ + u32 hash = inet_addr_hash(net, addr); + struct in_ifaddr *ifa; + + hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) + if (ifa->ifa_local == addr && + net_eq(dev_net(ifa->ifa_dev->dev), net)) + return ifa; + + return NULL; +} + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 37819ab4cc74..f02819134ba2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -345,9 +345,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (res.type != RTN_UNICAST && (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) goto e_inval; - if (!rpf && !fib_num_tclassid_users(net) && - (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) - goto last_resort; fib_combine_itag(itag, &res); dev_match = false; @@ -402,13 +399,26 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); + struct net *net = dev_net(dev); - if (!r && !fib_num_tclassid_users(dev_net(dev)) && - IN_DEV_ACCEPT_LOCAL(idev) && + if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { + if (IN_DEV_ACCEPT_LOCAL(idev)) + goto ok; + /* if no local routes are added from user space we can check + * for local addresses looking-up the ifaddr table + */ + if (net->ipv4.fib_has_custom_local_routes) + goto full_check; + if (inet_lookup_ifaddr_rcu(net, src)) + return -EINVAL; + +ok: *itag = 0; return 0; } + +full_check: return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); } @@ -759,6 +769,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, } err = fib_table_insert(net, tb, &cfg, extack); + if (!err && cfg.fc_type == RTN_LOCAL) + net->ipv4.fib_has_custom_local_routes = true; errout: return err; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 57a5d48acee8..467b3c15395f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -774,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh, struct netlink_ext_ack *extack) +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -1258,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh, extack); + err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 9707372b78ed..8a91ebbf0c01 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, - &sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, @@ -321,13 +321,14 @@ tb_found: goto fail_unlock; } success: - if (!hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -354,6 +355,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..914d56928578 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; @@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *p, *n; + struct rb_node *p = rb_first(&base->rb_root); - rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { - inet_putpeer(p); + while (p) { + struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); + + p = rb_next(p); + rb_erase(&peer->rb_node, &base->rb_root); + inet_putpeer(peer); cond_resched(); } - base->rb_root = RB_ROOT; base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..dc2317776499 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, @@ -733,7 +731,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { + if (skb->len - dev->hard_header_len > dev->mtu) { pskb_trim(skb, dev->mtu); truncate = true; } @@ -1013,15 +1011,14 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id); - ip_tunnel_delete_net(itn, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit = ipgre_exit_net, + .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1223,6 +1220,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -1246,13 +1244,16 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -1540,15 +1541,14 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_net(struct net *net) +static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - ip_tunnel_delete_net(itn, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, + .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1559,16 +1559,14 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_net(struct net *net) +static void __net_exit erspan_exit_batch_net(struct list_head *net_list) { - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit = erspan_exit_net, + .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e9805ad664ac..fe6fee728ce4 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1061,16 +1061,22 @@ static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, } } -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) +void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, + struct rtnl_link_ops *ops) { + struct ip_tunnel_net *itn; + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); + list_for_each_entry(net, net_list, exit_list) { + itn = net_generic(net, id); + ip_tunnel_destroy(itn, &list, ops); + } unregister_netdevice_many(&list); rtnl_unlock(); } -EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..58465c0a8682 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; @@ -452,15 +453,14 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_net(struct net *net) +static void __net_exit vti_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - ip_tunnel_delete_net(itn, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit = vti_exit_net, + .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index fb1ad22b5e29..1e47818e38c7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -634,15 +634,14 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_batch_net(struct list_head *list_net) { - struct ip_tunnel_net *itn = net_generic(net, ipip_net_id); - ip_tunnel_delete_net(itn, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit = ipip_exit_net, + .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..b3ee01b0551b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/nexthop.h> +#include <net/switchdev.h> struct ipmr_rule { struct fib_rule common; @@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +620,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -594,6 +703,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { + struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; @@ -603,6 +713,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -652,10 +766,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -754,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -828,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -851,6 +976,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } @@ -949,6 +1075,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1150,6 +1277,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ @@ -1161,8 +1289,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1189,6 +1318,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1238,6 +1369,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1245,6 +1377,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c, *tmp; LIST_HEAD(list); int i; @@ -1263,8 +1396,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { @@ -1724,10 +1859,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } +#ifdef CONFIG_NET_SWITCHDEV +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + struct vif_device *out_vif = &mrt->vif_table[out_vifi]; + struct vif_device *in_vif = &mrt->vif_table[in_vifi]; + + if (!skb->offload_mr_fwd_mark) + return false; + if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) + return false; + return netdev_phys_item_id_same(&out_vif->dev_parent_id, + &in_vif->dev_parent_id); +} +#else +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + return false; +} +#endif + /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1748,6 +1906,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -1925,8 +2086,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, - psend); + ipmr_queue_xmit(net, mrt, true_vifi, + skb2, cache, psend); } psend = ct; } @@ -1937,9 +2098,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb2, + cache, psend); } else { - ipmr_queue_xmit(net, mrt, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); return; } } @@ -2156,6 +2318,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; @@ -3048,14 +3213,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +static int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3072,7 +3310,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3082,6 +3322,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..1c7ed77968c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. */ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } @@ -3032,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { - int rc = 0; int cpu; ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); @@ -3089,7 +3094,7 @@ int __init ip_rt_init(void) #endif register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); - return rc; + return 0; } #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d3c038d7b04..cac8dd309f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -251,10 +251,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -265,7 +267,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -282,12 +284,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -358,11 +355,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -401,27 +400,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, - { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, .maxlen = sizeof(int), @@ -1085,6 +1063,28 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..8cf742fd4f99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -456,6 +457,18 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); +void tcp_init_transfer(struct sock *sk, int bpf_op) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { if (tsflags && skb) { @@ -869,6 +882,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -1126,7 +1140,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -2749,7 +2763,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { @@ -2759,7 +2773,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3c33220c418..29fff14d5a53 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,15 +9,18 @@ #include <net/inetpeer.h> #include <net/tcp.h> -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -25,8 +28,8 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ - if (net_get_random_once(key, sizeof(key)) && publish) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -37,7 +40,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) { int err; struct tcp_fastopen_context *ctx, *octx; @@ -61,26 +79,27 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct net *net, + const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -96,7 +115,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. */ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct net *net, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -104,7 +124,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(net, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -112,13 +132,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(net, buf, foc); } } #endif @@ -216,12 +236,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; @@ -279,25 +294,26 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { @@ -347,7 +363,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { cookie->len = -1; return true; } @@ -401,25 +417,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. */ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -428,17 +435,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -474,10 +482,10 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bddf724f5c02..fb0d7ed84b94 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. If our packet @@ -4266,11 +4271,6 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } -enum tcp_queue { - OOO_QUEUE, - RCV_QUEUE, -}; - /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket @@ -4286,7 +4286,6 @@ enum tcp_queue { * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, - enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4311,10 +4310,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->has_rxtstamp) { TCP_SKB_CB(to)->has_rxtstamp = true; - if (dest == OOO_QUEUE) - TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; - else - to->tstamp = from->tstamp; + to->tstamp = from->tstamp; } return true; @@ -4351,9 +4347,6 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); - /* Replace tstamp which was stomped by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4365,8 +4358,7 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, - tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4420,10 +4412,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Stash tstamp to avoid being stomped on by rbnode */ - if (TCP_SKB_CB(skb)->has_rxtstamp) - TCP_SKB_CB(skb)->swtstamp = skb->tstamp; - /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -4451,7 +4439,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); @@ -4502,7 +4490,7 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { goto coalesce_done; } @@ -4554,7 +4542,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, RCV_QUEUE, tail, + tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { @@ -5530,20 +5518,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5710,7 +5691,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_is_sack(tp) && sysctl_tcp_fack) tcp_enable_fack(tp); - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5936,15 +5916,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); } else { - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5964,19 +5947,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (req) { - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. - * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..c7460fd90884 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,23 +1503,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) @@ -2472,6 +2473,11 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); + return 0; fail: tcp_sk_exit(net); @@ -2481,7 +2487,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 102b2c90bb80..0ab78abc811b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -892,10 +892,14 @@ static void tcp_metrics_flush_all(struct net *net) for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp; + bool match; + spin_lock_bh(&tcp_metrics_lock); pp = &hb->chain; for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { - if (net_eq(tm_net(tm), net)) { + match = net ? net_eq(tm_net(tm), net) : + !atomic_read(&tm_net(tm)->count); + if (match) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); } else { @@ -1018,14 +1022,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) return 0; } -static void __net_exit tcp_net_metrics_exit(struct net *net) +static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { - tcp_metrics_flush_all(net); + tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { - .init = tcp_net_metrics_init, - .exit = tcp_net_metrics_exit, + .init = tcp_net_metrics_init, + .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..2341b9f857b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c839c99114c..8162e2880178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk); + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -1806,40 +1817,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions @@ -2294,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2872,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -3057,6 +3037,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ @@ -3112,9 +3093,14 @@ int tcp_send_synack(struct sock *sk) } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_unlink_write_queue(skb, sk); __skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); @@ -3423,6 +3409,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3475,6 +3465,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 449cd914d58e..cda6074a429a 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -58,45 +58,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. - */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. + */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ - break; } } } diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 218cfcc77650..ee113ff15fd0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd-1); + return min(tp->snd_ssthresh, tp->snd_cwnd); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..eb0359bdaa11 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1212,8 +1212,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&up->reader_queue)) + if (size < (sk->sk_rcvbuf >> 2)) return; } else { size += up->forward_deficit; @@ -2221,9 +2220,10 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2234,24 +2234,24 @@ void udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; /* we are supposed to accept bcast packets */ if (skb->pkt_type == PACKET_MULTICAST) { ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return; + return 0; } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -2263,7 +2263,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2272,12 +2272,23 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } + return 0; } int udp_rcv(struct sk_buff *skb) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2e2a78787ec..837418ff2d4b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_policy = 0, }; -/* Check if a valid qdisc is available */ -static inline bool addrconf_qdisc_ok(const struct net_device *dev) +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool addrconf_link_ready(const struct net_device *dev) { - return !qdisc_tx_is_noop(dev); + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); } static void addrconf_del_rs_timer(struct inet6_dev *idev) @@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->token = in6addr_any; - if (netif_running(dev) && addrconf_qdisc_ok(dev)) + if (netif_running(dev) && addrconf_link_ready(dev)) ndev->if_flags |= IF_READY; ipv6_mc_init_dev(ndev); @@ -1399,10 +1399,18 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(struct net *net, + struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + return false; + + return true; #else return false; #endif @@ -1472,7 +1480,7 @@ static int ipv6_get_saddr_eval(struct net *net, /* Rule 3: Avoid deprecated and optimistic addresses */ u8 avoid = IFA_F_DEPRECATED; - if (!ipv6_use_optimistic_addr(score->ifa->idev)) + if (!ipv6_use_optimistic_addr(net, score->ifa->idev)) avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || !(score->ifa->flags & avoid); @@ -2460,7 +2468,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int max_addresses = in6_dev->cnf.max_addresses; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && + if ((net->ipv6.devconf_all->optimistic_dad || + in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3051,7 +3060,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (idev->cnf.optimistic_dad && + if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || + idev->cnf.optimistic_dad) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3287,7 +3297,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, struct rt6_info *rt, *prev; rt = addrconf_dst_alloc(idev, &ifp->addr, false); - if (unlikely(IS_ERR(rt))) + if (IS_ERR(rt)) return PTR_ERR(rt); /* ifp->rt can be accessed outside of rtnl */ @@ -3393,7 +3403,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, /* restore routes for permanent addresses */ addrconf_permanent_addr(dev); - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); @@ -3408,7 +3418,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } } else if (event == NETDEV_CHANGE) { - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is still not ready. */ break; } @@ -3810,6 +3820,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || idev->cnf.accept_dad < 1 || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3841,7 +3852,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) */ if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(idev)) { + if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -3897,7 +3908,9 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || + idev->cnf.accept_dad > 1) && + !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; @@ -4940,9 +4953,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing - * TENTATIVE flag. + * TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); @@ -6604,9 +6618,9 @@ void addrconf_cleanup(void) unregister_pernet_subsys(&addrconf_ops); ipv6_addr_label_cleanup(); - rtnl_lock(); + rtnl_af_unregister(&inet6_ops); - __rtnl_af_unregister(&inet6_ops); + rtnl_lock(); /* clean dev list */ for_each_netdev(&init_net, dev) { diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index b055bc79f56d..c6311d7108f6 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -30,7 +30,6 @@ * Policy Table */ struct ip6addrlbl_entry { - possible_net_t lbl_net; struct in6_addr prefix; int prefixlen; int ifindex; @@ -41,19 +40,6 @@ struct ip6addrlbl_entry { struct rcu_head rcu; }; -static struct ip6addrlbl_table -{ - struct hlist_head head; - spinlock_t lock; - u32 seq; -} ip6addrlbl_table; - -static inline -struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) -{ - return read_pnet(&lbl->lbl_net); -} - /* * Default policy table (RFC6724 + extensions) * @@ -148,13 +134,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) } /* Find label */ -static bool __ip6addrlbl_match(struct net *net, - const struct ip6addrlbl_entry *p, +static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, int addrtype, int ifindex) { - if (!net_eq(ip6addrlbl_net(p), net)) - return false; if (p->ifindex && p->ifindex != ifindex) return false; if (p->addrtype && p->addrtype != addrtype) @@ -169,8 +152,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, int type, int ifindex) { struct ip6addrlbl_entry *p; - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (__ip6addrlbl_match(net, p, addr, type, ifindex)) + + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(p, addr, type, ifindex)) return p; } return NULL; @@ -196,8 +180,7 @@ u32 ipv6_addr_label(struct net *net, } /* allocate one entry */ -static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, - const struct in6_addr *prefix, +static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, int prefixlen, int ifindex, u32 label) { @@ -236,24 +219,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - write_pnet(&newp->lbl_net, net); refcount_set(&newp->refcnt, 1); return newp; } /* add a label */ -static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, + int replace) { - struct hlist_node *n; struct ip6addrlbl_entry *last = NULL, *p = NULL; + struct hlist_node *n; int ret = 0; ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, replace); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && - net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && p->ifindex == newp->ifindex && ipv6_addr_equal(&p->prefix, &newp->prefix)) { if (!replace) { @@ -273,10 +255,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) if (last) hlist_add_behind_rcu(&newp->list, &last->list); else - hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - ip6addrlbl_table.seq++; + net->ipv6.ip6addrlbl_table.seq++; return ret; } @@ -292,12 +274,12 @@ static int ip6addrlbl_add(struct net *net, __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); - newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); + newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) return PTR_ERR(newp); - spin_lock(&ip6addrlbl_table.lock); - ret = __ip6addrlbl_add(newp, replace); - spin_unlock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(net, newp, replace); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) ip6addrlbl_free(newp); return ret; @@ -315,9 +297,8 @@ static int __ip6addrlbl_del(struct net *net, ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, prefixlen, ifindex); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && - net_eq(ip6addrlbl_net(p), net) && p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); @@ -340,9 +321,9 @@ static int ip6addrlbl_del(struct net *net, __func__, prefix, prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); - spin_lock(&ip6addrlbl_table.lock); + spin_lock(&net->ipv6.ip6addrlbl_table.lock); ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); return ret; } @@ -354,6 +335,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net) ADDRLABEL(KERN_DEBUG "%s\n", __func__); + spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); + INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { int ret = ip6addrlbl_add(net, ip6addrlbl_init_table[i].prefix, @@ -373,14 +357,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) struct hlist_node *n; /* Remove all labels belonging to the exiting net */ - spin_lock(&ip6addrlbl_table.lock); - hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { - if (net_eq(ip6addrlbl_net(p), net)) { - hlist_del_rcu(&p->list); - ip6addrlbl_put(p); - } + spin_lock(&net->ipv6.ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); } - spin_unlock(&ip6addrlbl_table.lock); + spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } static struct pernet_operations ipv6_addr_label_ops = { @@ -390,8 +372,6 @@ static struct pernet_operations ipv6_addr_label_ops = { int __init ipv6_addr_label_init(void) { - spin_lock_init(&ip6addrlbl_table.lock); - return register_pernet_subsys(&ipv6_addr_label_ops); } @@ -510,11 +490,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) int err; rcu_read_lock(); - hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { - if (idx >= s_idx && - net_eq(ip6addrlbl_net(p), net)) { + hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { + if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -571,7 +550,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); if (p && !ip6addrlbl_hold(p)) p = NULL; - lseq = ip6addrlbl_table.seq; + lseq = net->ipv6.ip6addrlbl_table.seq; rcu_read_unlock(); if (!p) { diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 305e2ed730bf..11025f8d124b 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -99,7 +99,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hp->hdrlen+2)<<2; + hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); @@ -187,7 +187,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len; bool found; if (fragoff) @@ -204,7 +203,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } - len = skb->len - start; do { struct ipv6_opt_hdr _hdr, *hp; @@ -273,7 +271,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, if (!found) { nexthdr = hp->nexthdr; - len -= hdrlen; start += hdrlen; } } while (!found); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5acb54405b10..aeb49b4d8c7d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -255,7 +255,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, { struct sk_buff *skb; struct icmp6hdr *icmp6h; - int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) @@ -288,7 +287,7 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, } ip6_push_pending_frames(sk); out: - return err; + return 0; } struct icmpv6_msg { diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..241841fb479c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -940,24 +940,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. @@ -1155,19 +1156,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_net(struct net *net) +static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) { + struct net *net; LIST_HEAD(list); rtnl_lock(); - ip6gre_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + ip6gre_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit = ip6gre_exit_net, + .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; @@ -1310,6 +1313,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ae73164559d5..825548680b1e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1043,6 +1043,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1125,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1134,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; @@ -2167,17 +2168,16 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) +static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; - LIST_HEAD(list); for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) - unregister_netdevice_queue(dev, &list); + unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); @@ -2186,12 +2186,10 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } - - unregister_netdevice_many(&list); } static int __net_init ip6_tnl_init_net(struct net *net) @@ -2235,16 +2233,21 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_net(struct net *net) +static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) { + struct net *net; + LIST_HEAD(list); + rtnl_lock(); - ip6_tnl_destroy_tunnels(net); + list_for_each_entry(net, net_list, exit_list) + ip6_tnl_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit = ip6_tnl_exit_net, + .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; @@ -2259,6 +2262,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..dbb74f3c57a7 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { @@ -1052,23 +1053,22 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) +static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, + struct list_head *list) { int h; struct ip6_tnl *t; - LIST_HEAD(list); for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { - unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); - unregister_netdevice_queue(t->dev, &list); - unregister_netdevice_many(&list); + unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) @@ -1108,18 +1108,24 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_net(struct net *net) +static void __net_exit vti6_exit_batch_net(struct list_head *net_list) { - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n; + struct net *net; + LIST_HEAD(list); rtnl_lock(); - vti6_destroy_tunnels(ip6n); + list_for_each_entry(net, net_list, exit_list) { + ip6n = net_generic(net, vti6_net_id); + vti6_destroy_tunnels(ip6n, &list); + } + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit = vti6_exit_net, + .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a5e466d4e093..b9404feabd78 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_FREEBIND: + if (optlen < sizeof(int)) + goto e_inval; + /* we also don't have a separate freebind bit for IPV6 */ + inet_sk(sk)->freebind = valbool; + retv = 0; + break; + case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; @@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = inet_sk(sk)->transparent; break; + case IPV6_FREEBIND: + val = inet_sk(sk)->freebind; + break; + case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index ac912bb21747..a799f5258614 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1848,19 +1848,22 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_net(struct net *net) +static void __net_exit sit_exit_batch_net(struct list_head *net_list) { LIST_HEAD(list); + struct net *net; rtnl_lock(); - sit_destroy_tunnels(net, &list); + list_for_each_entry(net, net_list, exit_list) + sit_destroy_tunnels(net, &list); + unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit = sit_exit_net, + .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2ecfb137297..40d7234c27b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1015,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index af4e76ac88ff..0b750a22c4b9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1650,7 +1650,7 @@ static int kcm_clone(struct socket *osock, struct kcm_clone *info, } newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (unlikely(IS_ERR(newfile))) { + if (IS_ERR(newfile)) { err = PTR_ERR(newfile); goto out_sock_alloc_fail; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee485df73ccd..02d61101b108 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1314,6 +1314,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1685,14 +1688,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. */ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); @@ -1750,6 +1751,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a305e0c5925a..67c79d9b5c6c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -76,6 +76,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ @@ -160,6 +161,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ bool acpt_newsess; /* Indicates whether this @@ -254,7 +258,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 87da9ef61860..014a7bc2a872 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -44,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -58,17 +57,6 @@ struct l2tp_eth_sess { struct net_device *dev; }; -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; -}; - -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static int l2tp_eth_dev_init(struct net_device *dev) { @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); - - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); dev_put(dev); } @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; if (cfg->ifname) { strlcpy(name, cfg->ifname, IFNAMSIZ); @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, strlcpy(session->ifname, dev->name, IFNAMSIZ); dev_hold(dev); - pn = l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); return 0; @@ -342,22 +318,6 @@ out: return rc; } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); - - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); - - return 0; -} - -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 50e3ee9a9d61..bc6e8bfc5be4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -437,11 +437,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { + if (sock) inet_shutdown(sock, SEND_SHUTDOWN); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); - } + + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c index 1e1c9b20bab7..2fb703d70803 100644 --- a/net/mac802154/llsec.c +++ b/net/mac802154/llsec.c @@ -623,13 +623,18 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec, u8 iv[16]; struct scatterlist src; SKCIPHER_REQUEST_ON_STACK(req, key->tfm0); - int err; + int err, datalen; + unsigned char *data; llsec_geniv(iv, sec->params.hwaddr, &hdr->sec); - sg_init_one(&src, skb->data, skb->len); + /* Compute data payload offset and data length */ + data = skb_mac_header(skb) + skb->mac_len; + datalen = skb_tail_pointer(skb) - data; + sg_init_one(&src, data, datalen); + skcipher_request_set_tfm(req, key->tfm0); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &src, &src, skb->len, iv); + skcipher_request_set_crypt(req, &src, &src, datalen, iv); err = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return err; @@ -713,7 +718,8 @@ int mac802154_llsec_encrypt(struct mac802154_llsec *sec, struct sk_buff *skb) if (hlen < 0 || hdr.fc.type != IEEE802154_FC_TYPE_DATA) return -EINVAL; - if (!hdr.fc.security_enabled || hdr.sec.level == 0) { + if (!hdr.fc.security_enabled || + (hdr.sec.level == IEEE802154_SCF_SECLEVEL_NONE)) { skb_push(skb, hlen); return 0; } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index f236c0bc7b3f..51063d9ed0f7 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1041,12 +1041,24 @@ out: static int mtype_head(struct ip_set *set, struct sk_buff *skb) { - const struct htype *h = set->data; + struct htype *h = set->data; const struct htable *t; struct nlattr *nested; size_t memsize; u8 htable_bits; + /* If any members have expired, set->elements will be wrong + * mytype_expire function will update it with the right count. + * we do not hold set->lock here, so grab it first. + * set->elements can still be incorrect in the case of a huge set, + * because elements might time out during the listing. + */ + if (SET_WITH_TIMEOUT(set)) { + spin_lock_bh(&set->lock); + mtype_expire(set, h); + spin_unlock_bh(&set->lock); + } + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t) + set->ext_size; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f393a7086025..af8345fc4fbd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -429,7 +429,7 @@ nf_nat_setup_info(struct nf_conn *ct, srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); @@ -532,9 +532,9 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) unsigned int h; h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) @@ -807,8 +807,8 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) - nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); + if (nf_nat_htable_size < CONNTRACK_LOCKS) + nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -821,7 +821,7 @@ static int __init nf_nat_init(void) return ret; } - for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); nf_ct_helper_expectfn_register(&follow_master_nat); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 327807731b44..94c11cf0459d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2270,10 +2270,13 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + ret = 0; if (cb->start) - cb->start(cb); + ret = cb->start(cb); + + if (!ret) + ret = netlink_dump(sk); - ret = netlink_dump(sk); sock_put(sk); if (ret) diff --git a/net/nfc/core.c b/net/nfc/core.c index 5cf33df888c3..e5e23c2cbe74 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1094,9 +1094,8 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, dev->targets_generation = 1; if (ops->check_presence) { - init_timer(&dev->check_pres_timer); - dev->check_pres_timer.data = (unsigned long)dev; - dev->check_pres_timer.function = nfc_check_pres_timeout; + setup_timer(&dev->check_pres_timer, nfc_check_pres_timeout, + (unsigned long)dev); INIT_WORK(&dev->check_pres_work, nfc_check_pres_work); } diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index b740fef0acc5..a8a6e7814e09 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -1004,9 +1004,8 @@ int nfc_hci_register_device(struct nfc_hci_dev *hdev) INIT_WORK(&hdev->msg_tx_work, nfc_hci_msg_tx_work); - init_timer(&hdev->cmd_timer); - hdev->cmd_timer.data = (unsigned long)hdev; - hdev->cmd_timer.function = nfc_hci_cmd_timeout; + setup_timer(&hdev->cmd_timer, nfc_hci_cmd_timeout, + (unsigned long)hdev); skb_queue_head_init(&hdev->rx_hcp_frags); diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 17e59a009ce6..58df37eae1e8 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -763,17 +763,14 @@ static void *llc_shdlc_init(struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, mutex_init(&shdlc->state_mutex); shdlc->state = SHDLC_DISCONNECTED; - init_timer(&shdlc->connect_timer); - shdlc->connect_timer.data = (unsigned long)shdlc; - shdlc->connect_timer.function = llc_shdlc_connect_timeout; + setup_timer(&shdlc->connect_timer, llc_shdlc_connect_timeout, + (unsigned long)shdlc); - init_timer(&shdlc->t1_timer); - shdlc->t1_timer.data = (unsigned long)shdlc; - shdlc->t1_timer.function = llc_shdlc_t1_timeout; + setup_timer(&shdlc->t1_timer, llc_shdlc_t1_timeout, + (unsigned long)shdlc); - init_timer(&shdlc->t2_timer); - shdlc->t2_timer.data = (unsigned long)shdlc; - shdlc->t2_timer.function = llc_shdlc_t2_timeout; + setup_timer(&shdlc->t2_timer, llc_shdlc_t2_timeout, + (unsigned long)shdlc); shdlc->w = SHDLC_MAX_WINDOW; shdlc->srej_support = SHDLC_SREJ_SUPPORT; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 02eef5cf3cce..7988185072e5 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -1573,9 +1573,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); - init_timer(&local->link_timer); - local->link_timer.data = (unsigned long) local; - local->link_timer.function = nfc_llcp_symm_timer; + setup_timer(&local->link_timer, nfc_llcp_symm_timer, + (unsigned long)local); skb_queue_head_init(&local->tx_queue); INIT_WORK(&local->tx_work, nfc_llcp_tx_work); @@ -1601,9 +1600,8 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) mutex_init(&local->sdreq_lock); INIT_HLIST_HEAD(&local->pending_sdreqs); - init_timer(&local->sdreq_timer); - local->sdreq_timer.data = (unsigned long) local; - local->sdreq_timer.function = nfc_llcp_sdreq_timer; + setup_timer(&local->sdreq_timer, nfc_llcp_sdreq_timer, + (unsigned long)local); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); list_add(&local->list, &llcp_devices); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 0389398fa4ab..2e5e7a41d8ef 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL, NULL); + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c26172995511..3f5caa3fbd06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -544,9 +544,7 @@ static void prb_init_blk_timer(struct packet_sock *po, struct tpacket_kbdq_core *pkc, void (*func) (unsigned long)) { - init_timer(&pkc->retire_blk_timer); - pkc->retire_blk_timer.data = (long)po; - pkc->retire_blk_timer.function = func; + setup_timer(&pkc->retire_blk_timer, func, (long)po); pkc->retire_blk_timer.expires = jiffies; } @@ -1684,10 +1682,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) mutex_lock(&fanout_mutex); - err = -EINVAL; - if (!po->running) - goto out; - err = -EALREADY; if (po->fanout) goto out; @@ -1749,7 +1743,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) list_add(&match->list, &fanout_list); } err = -EINVAL; - if (match->type == type && + + spin_lock(&po->bind_lock); + if (po->running && + match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; @@ -1761,6 +1758,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = 0; } } + spin_unlock(&po->bind_lock); + + if (err && !refcount_read(&match->sk_ref)) { + list_del(&match->list); + kfree(match); + } + out: if (err && rollover) { kfree(rollover); @@ -2834,6 +2838,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2877,6 +2882,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2935,7 +2941,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; @@ -3063,13 +3069,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { diff --git a/net/rds/ib.c b/net/rds/ib.c index a0954ace3774..36dd2099048a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -126,6 +126,7 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; + bool has_fr, has_fmr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) @@ -143,11 +144,11 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); - rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr); + has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (has_fr && !has_fmr); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? diff --git a/net/rds/ib.h b/net/rds/ib.h index bf4822407567..6ea6a27891b0 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -215,8 +215,6 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - bool has_fmr; - bool has_fr; bool use_fastreg; unsigned int max_mrs; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 9a3c54e659e9..e678699268a2 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -601,11 +601,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; - pool->max_items = RDS_MR_1M_POOL_SIZE; + pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; - pool->max_items = RDS_MR_8K_POOL_SIZE; + pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c0c707eb2c96..5ef8ce8c83d4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d89ebafd2239..700b345b07f9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -17,13 +17,14 @@ #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/idr.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> struct basic_head { - u32 hgenerator; struct list_head flist; + struct idr handle_idr; struct rcu_head rcu; }; @@ -78,6 +79,7 @@ static int basic_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } @@ -99,8 +101,10 @@ static void basic_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); } + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -111,6 +115,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); *last = list_empty(&head->flist); return 0; @@ -154,6 +159,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *fold = (struct basic_filter *) *arg; struct basic_filter *fnew; + unsigned long idr_index; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; @@ -176,33 +182,33 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = -EINVAL; if (handle) { fnew->handle = handle; - } else if (fold) { - fnew->handle = fold->handle; + if (!fold) { + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (err) + goto errout; + } } else { - unsigned int i = 0x80000000; - do { - if (++head->hgenerator == 0x7FFFFFFF) - head->hgenerator = 1; - } while (--i > 0 && basic_get(tp, head->hgenerator)); - - if (i <= 0) { - pr_err("Insufficient number of handles\n"); + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (err) goto errout; - } - - fnew->handle = head->hgenerator; + fnew->handle = idr_index; } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); - if (err < 0) + if (err < 0) { + if (!fold) + idr_remove_ext(&head->handle_idr, fnew->handle); goto errout; + } *arg = fnew; if (fold) { + idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); call_rcu(&fold->rcu, basic_delete_filter); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 520c5027646a..6c6b21f6ba62 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -17,6 +17,7 @@ #include <linux/skbuff.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/idr.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> @@ -32,7 +33,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); struct cls_bpf_head { struct list_head plist; - u32 hgen; + struct idr handle_idr; struct rcu_head rcu; }; @@ -99,11 +100,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } @@ -238,6 +239,7 @@ static int cls_bpf_init(struct tcf_proto *tp) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->plist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; @@ -264,6 +266,9 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) { + struct cls_bpf_head *head = rtnl_dereference(tp->root); + + idr_remove_ext(&head->handle_idr, prog->handle); cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); @@ -287,6 +292,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp) list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog); + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -421,27 +427,6 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, return 0; } -static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, - struct cls_bpf_head *head) -{ - unsigned int i = 0x80000000; - u32 handle; - - do { - if (++head->hgen == 0x7FFFFFFF) - head->hgen = 1; - } while (--i > 0 && cls_bpf_get(tp, head->hgen)); - - if (unlikely(i == 0)) { - pr_err("Insufficient number of handles\n"); - handle = 0; - } else { - handle = head->hgen; - } - - return handle; -} - static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -451,6 +436,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; struct cls_bpf_prog *prog; + unsigned long idr_index; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -476,21 +462,30 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, } } - if (handle == 0) - prog->handle = cls_bpf_grab_new_handle(tp, head); - else + if (handle == 0) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (ret) + goto errout; + prog->handle = idr_index; + } else { + if (!oldprog) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (ret) + goto errout; + } prog->handle = handle; - if (prog->handle == 0) { - ret = -EINVAL; - goto errout; } ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr); if (ret < 0) - goto errout; + goto errout_idr; ret = cls_bpf_offload(tp, prog, oldprog); if (ret) { + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); __cls_bpf_delete_prog(prog); return ret; } @@ -499,6 +494,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; if (oldprog) { + idr_replace_ext(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); @@ -509,6 +505,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = prog; return 0; +errout_idr: + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); errout: tcf_exts_destroy(&prog->exts); kfree(prog); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a267e77c6de..db831ac708f6 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -152,37 +152,12 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; - struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); - info = skb_tunnel_info(skb); - if (info) { - struct ip_tunnel_key *key = &info->key; - - switch (ip_tunnel_info_af(info)) { - case AF_INET: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV4_ADDRS; - skb_key.enc_ipv4.src = key->u.ipv4.src; - skb_key.enc_ipv4.dst = key->u.ipv4.dst; - break; - case AF_INET6: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV6_ADDRS; - skb_key.enc_ipv6.src = key->u.ipv6.src; - skb_key.enc_ipv6.dst = key->u.ipv6.dst; - break; - } - - skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); - skb_key.enc_tp.src = key->tp_src; - skb_key.enc_tp.dst = key->tp_dst; - } - skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. @@ -922,28 +897,28 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_flags_valid(fnew->flags)) { err = -EINVAL; - goto errout; + goto errout_idr; } } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto errout_idr; err = fl_check_assign_mask(head, &mask); if (err) - goto errout; + goto errout_idr; if (!tc_skip_sw(fnew->flags)) { if (!fold && fl_lookup(head, &fnew->mkey)) { err = -EEXIST; - goto errout; + goto errout_idr; } err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) - goto errout; + goto errout_idr; } if (!tc_skip_hw(fnew->flags)) { @@ -952,7 +927,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, &mask.key, fnew); if (err) - goto errout; + goto errout_idr; } if (!tc_in_hw(fnew->flags)) @@ -981,6 +956,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(tb); return 0; +errout_idr: + if (fnew->handle) + idr_remove_ext(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 21cc45caf842..eeac606c95ab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (tc_skip_sw(head->flags)) return -1; + *res = head->res; return tcf_exts_exec(skb, &head->exts, res); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 10b8d851fc6b..094d224411a9 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -46,6 +46,7 @@ #include <net/act_api.h> #include <net/pkt_cls.h> #include <linux/netdevice.h> +#include <linux/idr.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -82,6 +83,7 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; + struct idr handle_idr; struct rcu_head rcu; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. @@ -93,7 +95,7 @@ struct tc_u_common { struct tc_u_hnode __rcu *hlist; struct Qdisc *q; int refcnt; - u32 hgenerator; + struct idr handle_idr; struct hlist_node hnode; struct rcu_head rcu; }; @@ -311,19 +313,19 @@ static void *u32_get(struct tcf_proto *tp, u32 handle) return u32_lookup_key(ht, handle); } -static u32 gen_new_htid(struct tc_u_common *tp_c) +static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) { - int i = 0x800; + unsigned long idr_index; + int err; - /* hgenerator only used inside rtnl lock it is safe to increment + /* This is only used inside rtnl lock it is safe to increment * without read _copy_ update semantics */ - do { - if (++tp_c->hgenerator == 0x7FF) - tp_c->hgenerator = 1; - } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); - - return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; + err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index, + 1, 0x7FF, GFP_KERNEL); + if (err) + return 0; + return (u32)(idr_index | 0x800) << 20; } static struct hlist_head *tc_u_common_hash; @@ -366,8 +368,9 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; root_ht->refcnt++; - root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; + idr_init(&root_ht->handle_idr); if (tp_c == NULL) { tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); @@ -377,6 +380,7 @@ static int u32_init(struct tcf_proto *tp) } tp_c->q = tp->q; INIT_HLIST_NODE(&tp_c->hnode); + idr_init(&tp_c->handle_idr); h = tc_u_hash(tp); hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); @@ -565,6 +569,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n->handle); + idr_remove_ext(&ht->handle_idr, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -586,6 +591,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { u32_clear_hw_hnode(tp, ht); + idr_destroy(&ht->handle_idr); + idr_remove_ext(&tp_c->handle_idr, ht->handle); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -633,6 +640,7 @@ static void u32_destroy(struct tcf_proto *tp) kfree_rcu(ht, rcu); } + idr_destroy(&tp_c->handle_idr); kfree(tp_c); } @@ -701,27 +709,21 @@ ret: return ret; } -#define NR_U32_NODE (1<<12) -static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid) { - struct tc_u_knode *n; - unsigned long i; - unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), - GFP_KERNEL); - if (!bitmap) - return handle | 0xFFF; - - for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]); - n; - n = rtnl_dereference(n->next)) - set_bit(TC_U32_NODE(n->handle), bitmap); - - i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); - if (i >= NR_U32_NODE) - i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); + unsigned long idr_index; + u32 start = htid | 0x800; + u32 max = htid | 0xFFF; + u32 min = htid; + + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + start, max + 1, GFP_KERNEL)) { + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + min + 1, max + 1, GFP_KERNEL)) + return max; + } - kfree(bitmap); - return handle | (i >= NR_U32_NODE ? 0xFFF : i); + return (u32)idr_index; } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -806,6 +808,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, if (pins->handle == n->handle) break; + idr_replace_ext(&ht->handle_idr, n, n->handle); RCU_INIT_POINTER(n->next, pins->next); rcu_assign_pointer(*ins, n); } @@ -937,22 +940,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; if (TC_U32_KEY(handle)) return -EINVAL; - if (handle == 0) { - handle = gen_new_htid(tp->data); - if (handle == 0) - return -ENOMEM; - } ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; + if (handle == 0) { + handle = gen_new_htid(tp->data, ht); + if (handle == 0) { + kfree(ht); + return -ENOMEM; + } + } else { + err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL, + handle, handle + 1, GFP_KERNEL); + if (err) { + kfree(ht); + return err; + } + } ht->tp_c = tp_c; ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; + idr_init(&ht->handle_idr); err = u32_replace_hw_hnode(tp, ht, flags); if (err) { + idr_remove_ext(&tp_c->handle_idr, handle); kfree(ht); return err; } @@ -986,24 +1000,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) return -EINVAL; handle = htid | TC_U32_NODE(handle); + err = idr_alloc_ext(&ht->handle_idr, NULL, NULL, + handle, handle + 1, + GFP_KERNEL); + if (err) + return err; } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL] == NULL) - return -EINVAL; + if (tb[TCA_U32_SEL] == NULL) { + err = -EINVAL; + goto erridr; + } s = nla_data(tb[TCA_U32_SEL]); n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); - if (n == NULL) - return -ENOBUFS; + if (n == NULL) { + err = -ENOBUFS; + goto erridr; + } #ifdef CONFIG_CLS_U32_PERF size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); if (!n->pf) { - kfree(n); - return -ENOBUFS; + err = -ENOBUFS; + goto errfree; } #endif @@ -1066,9 +1089,12 @@ errhw: errout: tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF +errfree: free_percpu(n->pf); #endif kfree(n); +erridr: + idr_remove_ext(&ht->handle_idr, handle); return err; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..a0a198768aad 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -685,13 +685,12 @@ void qdisc_reset(struct Qdisc *qdisc) qdisc->gso_skb = NULL; } qdisc->q.qlen = 0; + qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); -static void qdisc_rcu_free(struct rcu_head *head) +static void qdisc_free(struct Qdisc *qdisc) { - struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head); - if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); @@ -724,11 +723,7 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(qdisc->gso_skb); kfree_skb(qdisc->skb_bad_txq); - /* - * gen_estimator est_timer() might access qdisc->q.lock, - * wait a RCU grace period before freeing qdisc. - */ - call_rcu(&qdisc->rcu_head, qdisc_rcu_free); + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index daaf214e5201..3f88b75488b0 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -958,6 +958,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } if (cl != NULL) { + int old_flags; + if (parentid) { if (cl->cl_parent && cl->cl_parent->cl_common.classid != parentid) @@ -978,6 +980,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_lock(sch); + old_flags = cl->cl_flags; + if (rsc != NULL) hfsc_change_rsc(cl, rsc, cur_time); if (fsc != NULL) @@ -986,10 +990,21 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, hfsc_change_usc(cl, usc, cur_time); if (cl->qdisc->q.qlen != 0) { - if (cl->cl_flags & HFSC_RSC) - update_ed(cl, qdisc_peek_len(cl->qdisc)); - if (cl->cl_flags & HFSC_FSC) - update_vf(cl, 0, cur_time); + int len = qdisc_peek_len(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) { + if (old_flags & HFSC_RSC) + update_ed(cl, len); + else + init_ed(cl, len); + } + + if (cl->cl_flags & HFSC_FSC) { + if (old_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + else + init_vf(cl, len); + } } sch_tree_unlock(sch); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b1266e75ca43..5a4f10080290 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -146,7 +146,6 @@ struct netem_sched_data { */ struct netem_skb_cb { psched_time_t time_to_send; - ktime_t tstamp_save; }; @@ -362,12 +361,13 @@ static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sche static void tfifo_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); - struct rb_node *p; + struct rb_node *p = rb_first(&q->t_root); - while ((p = rb_first(&q->t_root))) { + while (p) { struct sk_buff *skb = netem_rb_to_skb(p); - rb_erase(p, &q->t_root); + p = rb_next(p); + rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } } @@ -561,7 +561,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, } cb->time_to_send = now + delay; - cb->tstamp_save = skb->tstamp; ++q->counter; tfifo_enqueue(skb, sch); } else { @@ -629,7 +628,10 @@ deliver: qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; - skb->tstamp = netem_skb_cb(skb)->tstamp_save; + /* skb->dev shares skb->rbnode area, + * we need to restore its value. + */ + skb->dev = qdisc_dev(sch); #ifdef CONFIG_NET_CLS_ACT /* diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 70f1b570bab9..bf90c5397719 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o + offload.o stream_sched.o stream_sched_prio.o \ + stream_sched_rr.o sctp_probe-y := probe.o diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 3afac275ee82..7b261afc47b9 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -311,10 +311,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && @@ -323,7 +323,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 2966ff400755..4db012aa25f7 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -50,6 +50,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -72,32 +73,38 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, - struct sctp_chunk *ch) + struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { - struct sctp_chunk *ch = NULL; - - if (!list_empty(&q->out_chunk_list)) { - struct list_head *entry = q->out_chunk_list.next; - - ch = list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); - q->out_qlen -= ch->skb->len; - } - return ch; + return q->sched->dequeue(q); } + /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add_tail(&ch->stream_list, &oute->outq); } /* @@ -207,6 +214,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); + sctp_sched_set_sched(asoc, SCTP_SS_FCFS); } /* Free the outqueue structure and any related pending chunks. @@ -258,6 +266,7 @@ static void __sctp_outq_teardown(struct sctp_outq *q) /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); @@ -366,7 +375,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (!chk->tsn_gap_acked) { if (chk->transport) @@ -391,20 +400,21 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; + q->sched->unsched_all(&asoc->stream); + list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; - list_del_init(&chk->list); - q->out_qlen -= chk->skb->len; + sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { struct sctp_stream_out *streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; - streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } msg_len -= SCTP_DATA_SNDSIZE(chk) + @@ -415,6 +425,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, break; } + q->sched->sched_all(&asoc->stream); + return msg_len; } @@ -1033,22 +1045,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); - /* RFC 2960 6.5 Every DATA chunk MUST carry a valid - * stream identifier. - */ - if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) { - - /* Mark as failed send. */ - sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); - if (asoc->peer.prsctp_capable && - SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) - asoc->sent_cnt_removable--; - sctp_chunk_free(chunk); - continue; - } - /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1070,6 +1069,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1133,6 +1133,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) else asoc->stats.oodchunks++; + /* Only now it's safe to consider this + * chunk as sent, sched-wise. + */ + sctp_sched_dequeue_done(q, chunk); + break; default: diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 22ed01a76b19..a72a7d925d46 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; + int pos = cb->args[2]; /* eps hashtable dumps * args: @@ -493,7 +494,8 @@ skip: goto done; sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, (int *)&cb->args[2], &commp); + net, &pos, &commp); + cb->args[2] = pos; done: cb->args[1] = cb->args[4]; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e6a2974e020e..402bfbb888cd 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -50,6 +50,7 @@ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> static int sctp_cmd_interpreter(enum sctp_event event_type, union sctp_subtype subtype, @@ -1089,6 +1090,8 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc, list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); + + asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d4730ada7f32..88c28421ec15 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -79,6 +79,7 @@ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> /* Forward declarations for internal helper functions. */ static int sctp_writeable(struct sock *sk); @@ -1927,6 +1928,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + /* Allocate sctp_stream_out_ext if not already done */ + if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); + if (err) + goto out_free; + } + if (sctp_wspace(asoc) < msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); @@ -3907,6 +3915,64 @@ out: return retval; } +static int sctp_setsockopt_scheduler(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_SS_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_sched(asoc, params.assoc_value); + +out: + return retval; +} + +static int sctp_setsockopt_scheduler_value(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_stream_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_value(asoc, params.stream_id, + params.stream_value, GFP_KERNEL); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4088,6 +4154,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ADD_STREAMS: retval = sctp_setsockopt_add_streams(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_setsockopt_scheduler(sk, optval, optlen); + break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6645,7 +6717,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_stream_out *streamout; + struct sctp_stream_out_ext *streamoute; struct sctp_association *asoc; struct sctp_prstatus params; int retval = -EINVAL; @@ -6668,21 +6740,29 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamout = &asoc->stream.out[params.sprstat_sid]; + streamoute = asoc->stream.out[params.sprstat_sid].ext; + if (!streamoute) { + /* Not allocated yet, means all stats are 0 */ + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + retval = 0; + goto out; + } + if (policy == SCTP_PR_SCTP_NONE) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += - streamout->abandoned_unsent[policy]; + streamoute->abandoned_unsent[policy]; params.sprstat_abandoned_sent += - streamout->abandoned_sent[policy]; + streamoute->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = - streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = - streamout->abandoned_sent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen) || copy_to_user(optval, ¶ms, len)) { @@ -6778,6 +6858,85 @@ out: return retval; } +static int sctp_getsockopt_scheduler(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = sctp_sched_get_sched(asoc); + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + +static int sctp_getsockopt_scheduler_value(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_stream_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + retval = sctp_sched_get_value(asoc, params.stream_id, + ¶ms.stream_value); + if (retval) + goto out; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ¶ms, len)) { + retval = -EFAULT; + goto out; + } + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6960,6 +7119,14 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_getsockopt_scheduler(sk, len, optval, + optlen); + break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_getsockopt_scheduler_value(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 63ea15503714..5ea33a2c453b 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -32,44 +32,181 @@ * Xin Long <lucien.xin@gmail.com> */ +#include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* Migrates chunks from stream queues to new stream queues if needed, + * but not across associations. Also, removes those chunks to streams + * higher than the new max. + */ +static void sctp_stream_outq_migrate(struct sctp_stream *stream, + struct sctp_stream *new, __u16 outcnt) +{ + struct sctp_association *asoc; + struct sctp_chunk *ch, *temp; + struct sctp_outq *outq; + int i; + + asoc = container_of(stream, struct sctp_association, stream); + outq = &asoc->outqueue; + + list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { + __u16 sid = sctp_chunk_stream_no(ch); + + if (sid < outcnt) + continue; + + sctp_sched_dequeue_common(outq, ch); + /* No need to call dequeue_done here because + * the chunks are not scheduled by now. + */ + + /* Mark as failed send. */ + sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM); + if (asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; + + sctp_chunk_free(ch); + } + + if (new) { + /* Here we actually move the old ext stuff into the new + * buffer, because we want to keep it. Then + * sctp_stream_update will swap ->out pointers. + */ + for (i = 0; i < outcnt; i++) { + kfree(new->out[i].ext); + new->out[i].ext = stream->out[i].ext; + stream->out[i].ext = NULL; + } + } + + for (i = outcnt; i < stream->outcnt; i++) + kfree(stream->out[i].ext); +} + +static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, + gfp_t gfp) +{ + struct sctp_stream_out *out; + + out = kmalloc_array(outcnt, sizeof(*out), gfp); + if (!out) + return -ENOMEM; + + if (stream->out) { + memcpy(out, stream->out, min(outcnt, stream->outcnt) * + sizeof(*out)); + kfree(stream->out); + } + + if (outcnt > stream->outcnt) + memset(out + stream->outcnt, 0, + (outcnt - stream->outcnt) * sizeof(*out)); + + stream->out = out; + + return 0; +} + +static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, + gfp_t gfp) +{ + struct sctp_stream_in *in; + + in = kmalloc_array(incnt, sizeof(*stream->in), gfp); + + if (!in) + return -ENOMEM; + + if (stream->in) { + memcpy(in, stream->in, min(incnt, stream->incnt) * + sizeof(*in)); + kfree(stream->in); + } + + if (incnt > stream->incnt) + memset(in + stream->incnt, 0, + (incnt - stream->incnt) * sizeof(*in)); + + stream->in = in; + + return 0; +} int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { - int i; + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + int i, ret = 0; + + gfp |= __GFP_NOWARN; /* Initial stream->out size may be very big, so free it and alloc - * a new one with new outcnt to save memory. + * a new one with new outcnt to save memory if needed. */ - kfree(stream->out); + if (outcnt == stream->outcnt) + goto in; - stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); - if (!stream->out) - return -ENOMEM; + /* Filter out chunks queued on streams that won't exist anymore */ + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, NULL, outcnt); + sched->sched_all(stream); + + i = sctp_stream_alloc_out(stream, outcnt, gfp); + if (i) + return i; stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; + sched->init(stream); + +in: if (!incnt) - return 0; + goto out; - stream->in = kcalloc(incnt, sizeof(*stream->in), gfp); - if (!stream->in) { - kfree(stream->out); - stream->out = NULL; - return -ENOMEM; + i = sctp_stream_alloc_in(stream, incnt, gfp); + if (i) { + ret = -ENOMEM; + goto free; } stream->incnt = incnt; + goto out; - return 0; +free: + sched->free(stream); + kfree(stream->out); + stream->out = NULL; +out: + return ret; +} + +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_stream_out_ext *soute; + + soute = kzalloc(sizeof(*soute), GFP_KERNEL); + if (!soute) + return -ENOMEM; + stream->out[sid].ext = soute; + + return sctp_sched_init_sid(stream, sid, GFP_KERNEL); } void sctp_stream_free(struct sctp_stream *stream) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + int i; + + sched->free(stream); + for (i = 0; i < stream->outcnt; i++) + kfree(stream->out[i].ext); kfree(stream->out); kfree(stream->in); } @@ -87,6 +224,10 @@ void sctp_stream_clear(struct sctp_stream *stream) void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; @@ -94,6 +235,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) stream->outcnt = new->outcnt; stream->incnt = new->incnt; + sched->sched_all(stream); + new->out = NULL; new->in = NULL; } @@ -270,15 +413,9 @@ int sctp_send_add_streams(struct sctp_association *asoc, } if (out) { - struct sctp_stream_out *streamout; - - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_KERNEL); - if (!streamout) + retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); + if (retval) goto out; - - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; } chunk = sctp_make_strreset_addstrm(asoc, out, in); @@ -601,7 +738,6 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_in *streamin; __u32 request_seq, incnt; __u16 in, i; @@ -648,13 +784,9 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( if (!in || incnt > SCTP_MAX_STREAM) goto out; - streamin = krealloc(stream->in, incnt * sizeof(*streamin), - GFP_ATOMIC); - if (!streamin) + if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; - memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); - stream->in = streamin; stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; @@ -676,10 +808,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_out *streamout; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; + int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || @@ -708,14 +840,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( if (!out || outcnt > SCTP_MAX_STREAM) goto out; - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_ATOMIC); - if (!streamout) + ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); + if (ret) goto out; - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; - chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c new file mode 100644 index 000000000000..03513a9fa110 --- /dev/null +++ b/net/sctp/stream_sched.c @@ -0,0 +1,275 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + */ + +#include <linux/list.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* First Come First Serve (a.k.a. FIFO) + * RFC DRAFT ndata Section 3.1 + */ +static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, + __u16 value, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = 0; + return 0; +} + +static int sctp_sched_fcfs_init(struct sctp_stream *stream) +{ + return 0; +} + +static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + return 0; +} + +static void sctp_sched_fcfs_free(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ +} + +static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_chunk *ch = NULL; + struct list_head *entry; + + if (list_empty(&q->out_chunk_list)) + goto out; + + if (stream->out_curr) { + ch = list_entry(stream->out_curr->ext->outq.next, + struct sctp_chunk, stream_list); + } else { + entry = q->out_chunk_list.next; + ch = list_entry(entry, struct sctp_chunk, list); + } + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *chunk) +{ +} + +static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) +{ +} + +static struct sctp_sched_ops sctp_sched_fcfs = { + .set = sctp_sched_fcfs_set, + .get = sctp_sched_fcfs_get, + .init = sctp_sched_fcfs_init, + .init_sid = sctp_sched_fcfs_init_sid, + .free = sctp_sched_fcfs_free, + .enqueue = sctp_sched_fcfs_enqueue, + .dequeue = sctp_sched_fcfs_dequeue, + .dequeue_done = sctp_sched_fcfs_dequeue_done, + .sched_all = sctp_sched_fcfs_sched_all, + .unsched_all = sctp_sched_fcfs_unsched_all, +}; + +/* API to other parts of the stack */ + +extern struct sctp_sched_ops sctp_sched_prio; +extern struct sctp_sched_ops sctp_sched_rr; + +struct sctp_sched_ops *sctp_sched_ops[] = { + &sctp_sched_fcfs, + &sctp_sched_prio, + &sctp_sched_rr, +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched) +{ + struct sctp_sched_ops *n = sctp_sched_ops[sched]; + struct sctp_sched_ops *old = asoc->outqueue.sched; + struct sctp_datamsg *msg = NULL; + struct sctp_chunk *ch; + int i, ret = 0; + + if (old == n) + return ret; + + if (sched > SCTP_SS_MAX) + return -EINVAL; + + if (old) { + old->free(&asoc->stream); + + /* Give the next scheduler a clean slate. */ + for (i = 0; i < asoc->stream.outcnt; i++) { + void *p = asoc->stream.out[i].ext; + + if (!p) + continue; + + p += offsetofend(struct sctp_stream_out_ext, outq); + memset(p, 0, sizeof(struct sctp_stream_out_ext) - + offsetofend(struct sctp_stream_out_ext, outq)); + } + } + + asoc->outqueue.sched = n; + n->init(&asoc->stream); + for (i = 0; i < asoc->stream.outcnt; i++) { + if (!asoc->stream.out[i].ext) + continue; + + ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); + if (ret) + goto err; + } + + /* We have to requeue all chunks already queued. */ + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + if (ch->msg == msg) + continue; + msg = ch->msg; + n->enqueue(&asoc->outqueue, msg); + } + + return ret; + +err: + n->free(&asoc->stream); + asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ + + return ret; +} + +int sctp_sched_get_sched(struct sctp_association *asoc) +{ + int i; + + for (i = 0; i <= SCTP_SS_MAX; i++) + if (asoc->outqueue.sched == sctp_sched_ops[i]) + return i; + + return 0; +} + +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) { + int ret; + + ret = sctp_stream_init_ext(&asoc->stream, sid); + if (ret) + return ret; + } + + return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); +} + +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) + return 0; + + return asoc->outqueue.sched->get(&asoc->stream, sid, value); +} + +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) +{ + if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) { + struct sctp_stream_out *sout; + __u16 sid; + + /* datamsg is not finish, so save it as current one, + * in case application switch scheduler or a higher + * priority stream comes in. + */ + sid = sctp_chunk_stream_no(ch); + sout = &q->asoc->stream.out[sid]; + q->asoc->stream.out_curr = sout; + return; + } + + q->asoc->stream.out_curr = NULL; + q->sched->dequeue_done(q, ch); +} + +/* Auxiliary functions for the schedulers */ +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) +{ + list_del_init(&ch->list); + list_del_init(&ch->stream_list); + q->out_qlen -= ch->skb->len; +} + +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) +{ + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + INIT_LIST_HEAD(&stream->out[sid].ext->outq); + return sched->init_sid(stream, sid, gfp); +} + +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + + asoc = container_of(stream, struct sctp_association, stream); + + return asoc->outqueue.sched; +} diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c new file mode 100644 index 000000000000..384dbf3c8760 --- /dev/null +++ b/net/sctp/stream_sched_prio.c @@ -0,0 +1,347 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + */ + +#include <linux/list.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* Priority handling + * RFC DRAFT ndata section 3.4 + */ + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); + +static struct sctp_stream_priorities *sctp_sched_prio_new_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + + p = kmalloc(sizeof(*p), gfp); + if (!p) + return NULL; + + INIT_LIST_HEAD(&p->prio_sched); + INIT_LIST_HEAD(&p->active); + p->next = NULL; + p->prio = prio; + + return p; +} + +static struct sctp_stream_priorities *sctp_sched_prio_get_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + int i; + + /* Look into scheduled priorities first, as they are sorted and + * we can find it fast IF it's scheduled. + */ + list_for_each_entry(p, &stream->prio_list, prio_sched) { + if (p->prio == prio) + return p; + if (p->prio > prio) + break; + } + + /* No luck. So we search on all streams now. */ + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + + p = stream->out[i].ext->prio_head; + if (!p) + /* Means all other streams won't be initialized + * as well. + */ + break; + if (p->prio == prio) + return p; + } + + /* If not even there, allocate a new one. */ + return sctp_sched_prio_new_head(stream, prio, gfp); +} + +static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p) +{ + struct list_head *pos; + + pos = p->next->prio_list.next; + if (pos == &p->active) + pos = pos->next; + p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list); +} + +static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute) +{ + bool scheduled = false; + + if (!list_empty(&soute->prio_list)) { + struct sctp_stream_priorities *prio_head = soute->prio_head; + + /* Scheduled */ + scheduled = true; + + if (prio_head->next == soute) + /* Try to move to the next stream */ + sctp_sched_prio_next_stream(prio_head); + + list_del_init(&soute->prio_list); + + /* Also unsched the priority if this was the last stream */ + if (list_empty(&prio_head->active)) { + list_del_init(&prio_head->prio_sched); + /* If there is no stream left, clear next */ + prio_head->next = NULL; + } + } + + return scheduled; +} + +static void sctp_sched_prio_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + struct sctp_stream_priorities *prio, *prio_head; + + prio_head = soute->prio_head; + + /* Nothing to do if already scheduled */ + if (!list_empty(&soute->prio_list)) + return; + + /* Schedule the stream. If there is a next, we schedule the new + * one before it, so it's the last in round robin order. + * If there isn't, we also have to schedule the priority. + */ + if (prio_head->next) { + list_add(&soute->prio_list, prio_head->next->prio_list.prev); + return; + } + + list_add(&soute->prio_list, &prio_head->active); + prio_head->next = soute; + + list_for_each_entry(prio, &stream->prio_list, prio_sched) { + if (prio->prio > prio_head->prio) { + list_add(&prio_head->prio_sched, prio->prio_sched.prev); + return; + } + } + + list_add_tail(&prio_head->prio_sched, &stream->prio_list); +} + +static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + struct sctp_stream_out *sout = &stream->out[sid]; + struct sctp_stream_out_ext *soute = sout->ext; + struct sctp_stream_priorities *prio_head, *old; + bool reschedule = false; + int i; + + prio_head = sctp_sched_prio_get_head(stream, prio, gfp); + if (!prio_head) + return -ENOMEM; + + reschedule = sctp_sched_prio_unsched(soute); + old = soute->prio_head; + soute->prio_head = prio_head; + if (reschedule) + sctp_sched_prio_sched(stream, soute); + + if (!old) + /* Happens when we set the priority for the first time */ + return 0; + + for (i = 0; i < stream->outcnt; i++) { + soute = stream->out[i].ext; + if (soute && soute->prio_head == old) + /* It's still in use, nothing else to do here. */ + return 0; + } + + /* No hits, we are good to free it. */ + kfree(old); + + return 0; +} + +static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = stream->out[sid].ext->prio_head->prio; + return 0; +} + +static int sctp_sched_prio_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->prio_list); + + return 0; +} + +static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); + return sctp_sched_prio_set(stream, sid, 0, gfp); +} + +static void sctp_sched_prio_free(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *prio, *n; + LIST_HEAD(list); + int i; + + /* As we don't keep a list of priorities, to avoid multiple + * frees we have to do it in 3 steps: + * 1. unsched everyone, so the lists are free to use in 2. + * 2. build the list of the priorities + * 3. free the list + */ + sctp_sched_prio_unsched_all(stream); + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + prio = stream->out[i].ext->prio_head; + if (prio && list_empty(&prio->prio_sched)) + list_add(&prio->prio_sched, &list); + } + list_for_each_entry_safe(prio, n, &list, prio_sched) { + list_del_init(&prio->prio_sched); + kfree(prio); + } +} + +static void sctp_sched_prio_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_prio_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next. It's easy, it's either the current + * one or the first chunk on the next active stream. + */ + if (stream->out_curr) { + soute = stream->out_curr->ext; + } else { + prio = list_entry(stream->prio_list.next, + struct sctp_stream_priorities, prio_sched); + soute = prio->next; + } + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream on + * this priority. + */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + prio = soute->prio_head; + + sctp_sched_prio_next_stream(prio); + + if (list_empty(&soute->outq)) + sctp_sched_prio_unsched(soute); +} + +static void sctp_sched_prio_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out *sout; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + sout = &stream->out[sid]; + if (sout->ext) + sctp_sched_prio_sched(stream, sout->ext); + } +} + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *p, *tmp; + struct sctp_stream_out_ext *soute, *souttmp; + + list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched) + list_for_each_entry_safe(soute, souttmp, &p->active, prio_list) + sctp_sched_prio_unsched(soute); +} + +struct sctp_sched_ops sctp_sched_prio = { + .set = sctp_sched_prio_set, + .get = sctp_sched_prio_get, + .init = sctp_sched_prio_init, + .init_sid = sctp_sched_prio_init_sid, + .free = sctp_sched_prio_free, + .enqueue = sctp_sched_prio_enqueue, + .dequeue = sctp_sched_prio_dequeue, + .dequeue_done = sctp_sched_prio_dequeue_done, + .sched_all = sctp_sched_prio_sched_all, + .unsched_all = sctp_sched_prio_unsched_all, +}; diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c new file mode 100644 index 000000000000..7612a438c5b9 --- /dev/null +++ b/net/sctp/stream_sched_rr.c @@ -0,0 +1,201 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + */ + +#include <linux/list.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* Priority handling + * RFC DRAFT ndata section 3.2 + */ +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream); + +static void sctp_sched_rr_next_stream(struct sctp_stream *stream) +{ + struct list_head *pos; + + pos = stream->rr_next->rr_list.next; + if (pos == &stream->rr_list) + pos = pos->next; + stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list); +} + +static void sctp_sched_rr_unsched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (stream->rr_next == soute) + /* Try to move to the next stream */ + sctp_sched_rr_next_stream(stream); + + list_del_init(&soute->rr_list); + + /* If we have no other stream queued, clear next */ + if (list_empty(&stream->rr_list)) + stream->rr_next = NULL; +} + +static void sctp_sched_rr_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (!list_empty(&soute->rr_list)) + /* Already scheduled. */ + return; + + /* Schedule the stream */ + list_add_tail(&soute->rr_list, &stream->rr_list); + + if (!stream->rr_next) + stream->rr_next = soute; +} + +static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + return 0; +} + +static int sctp_sched_rr_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->rr_list); + stream->rr_next = NULL; + + return 0; +} + +static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); + + return 0; +} + +static void sctp_sched_rr_free(struct sctp_stream *stream) +{ + sctp_sched_rr_unsched_all(stream); +} + +static void sctp_sched_rr_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_rr_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next */ + if (stream->out_curr) + soute = stream->out_curr->ext; + else + soute = stream->rr_next; + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_rr_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + + sctp_sched_rr_next_stream(&q->asoc->stream); + + if (list_empty(&soute->outq)) + sctp_sched_rr_unsched(&q->asoc->stream, soute); +} + +static void sctp_sched_rr_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + soute = stream->out[sid].ext; + if (soute) + sctp_sched_rr_sched(stream, soute); + } +} + +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_out_ext *soute, *tmp; + + list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list) + sctp_sched_rr_unsched(stream, soute); +} + +struct sctp_sched_ops sctp_sched_rr = { + .set = sctp_sched_rr_set, + .get = sctp_sched_rr_get, + .init = sctp_sched_rr_init, + .init_sid = sctp_sched_rr_init_sid, + .free = sctp_sched_rr_free, + .enqueue = sctp_sched_rr_enqueue, + .dequeue = sctp_sched_rr_dequeue, + .dequeue_done = sctp_sched_rr_dequeue_done, + .sched_all = sctp_sched_rr_sched_all, + .unsched_all = sctp_sched_rr_unsched_all, +}; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8c6d24b2995d..745f145d4c4d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -282,6 +282,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, u8 *prefix_len) { struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct in_device *in_dev; struct sockaddr_in addr; int rc = -ENOENT; int len; @@ -298,14 +299,17 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, /* get address to which the internal TCP socket is bound */ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); /* analyze IPv4 specific data of net_device belonging to TCP socket */ - for_ifa(dst->dev->ip_ptr) { - if (ifa->ifa_address != addr.sin_addr.s_addr) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dst->dev); + for_ifa(in_dev) { + if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) continue; *prefix_len = inet_mask_len(ifa->ifa_mask); *subnet = ifa->ifa_address & ifa->ifa_mask; rc = 0; break; - } endfor_ifa(dst->dev->ip_ptr); + } endfor_ifa(in_dev); + rcu_read_unlock(); out_rel: dst_release(dst); @@ -509,7 +513,7 @@ decline_rdma: /* RDMA setup failed, switch back to TCP */ smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code, 0); + rc = smc_clc_send_decline(smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } @@ -804,8 +808,6 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ goto decline_rdma; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -899,7 +901,7 @@ decline_rdma: smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code, 0); + rc = smc_clc_send_decline(new_smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } diff --git a/net/smc/smc.h b/net/smc/smc.h index 6e44313e4467..0ccd6fa387ad 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -149,7 +149,7 @@ struct smc_connection { atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ - struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index a7294edbc221..5ef97e5a5f78 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -62,10 +62,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, bh_unlock_sock(&smc->sk); } -int smc_cdc_get_free_slot(struct smc_link *link, +int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_wr_buf **wr_buf, struct smc_cdc_tx_pend **pend) { + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, (struct smc_wr_tx_pend_priv **)pend); } @@ -118,8 +120,7 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, - &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (rc) return rc; diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 8e1d76f26007..56f883d1159c 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -206,7 +206,8 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, struct smc_cdc_tx_pend; -int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf, +int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_wr_buf **wr_buf, struct smc_cdc_tx_pend **pend); void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3934913ab835..b7dd2743fb5c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -95,9 +95,10 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; - if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) - == SMC_CLC_DECL_SYNCERR) + if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = true; + smc_lgr_terminate(smc->conn.lgr); + } } out: @@ -105,8 +106,7 @@ out: } /* send CLC DECLINE message across internal TCP socket */ -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync) +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) { struct smc_clc_msg_decline dclc; struct msghdr msg; @@ -118,7 +118,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; - dclc.hdr.flag = out_of_sync ? 1 : 0; + dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 13db8ce177c9..1c55414041d4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -106,8 +106,7 @@ struct smc_ib_device; int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport); int smc_clc_send_confirm(struct smc_sock *smc); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 3c2e166b5d22..f0d16fb825f7 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -174,15 +174,15 @@ int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER) && - !(current->flags & PF_EXITING)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -208,7 +208,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ @@ -234,7 +234,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_err != ECONNABORTED) { /* confirm close from peer */ @@ -263,7 +263,9 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - cancel_work_sync(&conn->tx_work); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); smc_close_abort(conn); sk->sk_state = SMC_CLOSED; smc_close_wait_tx_pends(smc); @@ -411,13 +413,14 @@ void smc_close_sock_put_work(struct work_struct *work) int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -425,7 +428,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* send close wr request */ rc = smc_close_wr(conn); @@ -439,7 +442,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* confirm close from peer */ rc = smc_close_wr(conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1a16d51e2330..20b66e79c5d6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,8 +25,9 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_LGR_NUM_INCR 256 -#define SMC_LGR_FREE_DELAY (600 * HZ) +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) static u32 smc_lgr_num; /* unique link group number */ @@ -107,8 +108,15 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (reduced && !lgr->conns_num) - schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); + if (!reduced || lgr->conns_num) + return; + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); } static void smc_lgr_free_work(struct work_struct *work) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 547e0e113b17..0b5852299158 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); if (ndev) { memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + dev_put(ndev); } else if (!rc) { memcpy(&smcibdev->mac[ibport - 1][0], &smcibdev->gid[ibport - 1].raw[8], 3); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 78f7af28ae4f..31f8453c25c5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -181,8 +181,10 @@ static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) sizeof(new_pnetelem->ndev->name)) || smc_pnet_same_ibname(pnetelem, new_pnetelem->smcibdev->ibdev->name, - new_pnetelem->ib_port)) + new_pnetelem->ib_port)) { + dev_put(pnetelem->ndev); goto found; + } } list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); rc = 0; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index b17a333e9bb0..3e631ae4b6b6 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -148,6 +148,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, read_done = sock_intr_errno(timeo); break; } + if (!timeo) + return -EAGAIN; } if (!atomic_read(&conn->bytes_to_rcv)) { diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..ec49bc3c3283 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,8 @@ #include "smc_cdc.h" #include "smc_tx.h" +#define SMC_TX_WORK_DELAY HZ + /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() @@ -394,8 +396,7 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) int rc; spin_lock_bh(&conn->send_lock); - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, - &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -406,7 +407,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } rc = 0; - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -430,7 +432,7 @@ out_unlock: */ static void smc_tx_work(struct work_struct *work) { - struct smc_connection *conn = container_of(work, + struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -463,12 +465,12 @@ void smc_tx_consumer_update(struct smc_connection *conn) ((to_confirm > conn->rmbe_update_limit) && ((to_confirm > (conn->rmbe_size / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { - rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], - &wr_buf, &pend); + rc = smc_cdc_get_free_slot(conn, &wr_buf, &pend); if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); return; } smc_curs_write(&conn->rx_curs_confirmed, @@ -487,6 +489,6 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_WORK(&smc->conn.tx_work, smc_tx_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); spin_lock_init(&smc->conn.send_lock); } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ab56bda66783..525d91e0d57e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -244,7 +244,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) int rc; ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5a936a6a31a3..df062e086bdb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 6ef379f004ac..121e59a1d0e7 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -551,7 +551,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index a24369d175fd..970f96489fe7 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -15,6 +15,16 @@ config VSOCKETS To compile this driver as a module, choose M here: the module will be called vsock. If unsure, say N. +config VSOCKETS_DIAG + tristate "Virtual Sockets monitoring interface" + depends on VSOCKETS + default y + help + Support for PF_VSOCK sockets monitoring interface used by the ss tool. + If unsure, say Y. + + Enable this module so userspace applications can query open sockets. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index e63d574234a9..64afc06805da 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_VSOCKETS) += vsock.o +obj-$(CONFIG_VSOCKETS_DIAG) += vsock_diag.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o @@ -6,6 +7,8 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o +vsock_diag-y += diag.o + vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index dfc8c51e4d74..98359c19522f 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -36,7 +36,7 @@ * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this - * implementation because they are in the VSOCK_SS_LISTEN state. When a + * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. @@ -82,6 +82,15 @@ * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. + * + * - sk->sk_state uses the TCP state constants because they are widely used by + * other address families and exposed to userspace tools like ss(8): + * + * TCP_CLOSE - unconnected + * TCP_SYN_SENT - connecting + * TCP_ESTABLISHED - connected + * TCP_CLOSING - disconnecting + * TCP_LISTEN - listening */ #include <linux/types.h> @@ -153,7 +162,6 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ -#define VSOCK_HASH_SIZE 251 #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) @@ -168,9 +176,12 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) -static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; -static struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; -static DEFINE_SPINLOCK(vsock_table_lock); +struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +EXPORT_SYMBOL_GPL(vsock_bind_table); +struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +EXPORT_SYMBOL_GPL(vsock_connected_table); +DEFINE_SPINLOCK(vsock_table_lock); +EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) @@ -248,16 +259,6 @@ static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, return NULL; } -static bool __vsock_in_bound_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->bound_table); -} - -static bool __vsock_in_connected_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->connected_table); -} - static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); @@ -485,7 +486,7 @@ void vsock_pending_work(struct work_struct *work) if (vsock_in_connected_table(vsk)) vsock_remove_connected(vsk); - sk->sk_state = SS_FREE; + sk->sk_state = TCP_CLOSE; out: release_sock(sk); @@ -625,7 +626,6 @@ struct sock *__vsock_create(struct net *net, sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; - sk->sk_state = 0; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); @@ -899,7 +899,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, /* Listening sockets that have connections in their accept * queue can be read. */ - if (sk->sk_state == VSOCK_SS_LISTEN + if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= POLLIN | POLLRDNORM; @@ -928,7 +928,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, } /* Connected sockets that can produce data can be written. */ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( @@ -950,7 +950,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == SS_UNCONNECTED) { + if (sk->sk_state == TCP_CLOSE) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; @@ -1120,9 +1120,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk = sk_vsock(vsk); lock_sock(sk); - if (sk->sk_state == SS_CONNECTING && + if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); cancel = 1; @@ -1168,7 +1168,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = -EALREADY; break; default: - if ((sk->sk_state == VSOCK_SS_LISTEN) || + if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; @@ -1191,7 +1191,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - sk->sk_state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) @@ -1211,7 +1211,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { + while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection @@ -1234,13 +1234,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; @@ -1251,7 +1251,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (sk->sk_err) { err = -sk->sk_err; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; @@ -1284,7 +1284,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, goto out; } - if (listener->sk_state != VSOCK_SS_LISTEN) { + if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } @@ -1374,7 +1374,7 @@ static int vsock_listen(struct socket *sock, int backlog) } sk->sk_max_ack_backlog = backlog; - sk->sk_state = VSOCK_SS_LISTEN; + sk->sk_state = TCP_LISTEN; err = 0; @@ -1554,7 +1554,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Callers should not provide a destination with stream sockets. */ if (msg->msg_namelen) { - err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP; + err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; } @@ -1565,7 +1565,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != SS_CONNECTED || + if (sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; @@ -1689,7 +1689,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, lock_sock(sk); - if (sk->sk_state != SS_CONNECTED) { + if (sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occured with the diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c new file mode 100644 index 000000000000..31b567652250 --- /dev/null +++ b/net/vmw_vsock/diag.c @@ -0,0 +1,186 @@ +/* + * vsock sock_diag(7) module + * + * Copyright (C) 2017 Red Hat, Inc. + * Author: Stefan Hajnoczi <stefanha@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/module.h> +#include <linux/sock_diag.h> +#include <linux/vm_sockets_diag.h> +#include <net/af_vsock.h> + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_diag_msg *rep; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); + if (!nlh) + return -EMSGSIZE; + + rep = nlmsg_data(nlh); + rep->vdiag_family = AF_VSOCK; + + /* Lock order dictates that sk_lock is acquired before + * vsock_table_lock, so we cannot lock here. Simply don't take + * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is + * held. + */ + rep->vdiag_type = sk->sk_type; + rep->vdiag_state = sk->sk_state; + rep->vdiag_shutdown = sk->sk_shutdown; + rep->vdiag_src_cid = vsk->local_addr.svm_cid; + rep->vdiag_src_port = vsk->local_addr.svm_port; + rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; + rep->vdiag_dst_port = vsk->remote_addr.svm_port; + rep->vdiag_ino = sock_i_ino(sk); + + sock_diag_save_cookie(sk, rep->vdiag_cookie); + + return 0; +} + +static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct vsock_diag_req *req; + struct vsock_sock *vsk; + unsigned int bucket; + unsigned int last_i; + unsigned int table; + struct net *net; + unsigned int i; + + req = nlmsg_data(cb->nlh); + net = sock_net(skb->sk); + + /* State saved between calls: */ + table = cb->args[0]; + bucket = cb->args[1]; + i = last_i = cb->args[2]; + + /* TODO VMCI pending sockets? */ + + spin_lock_bh(&vsock_table_lock); + + /* Bind table (locally created sockets) */ + if (table == 0) { + while (bucket < ARRAY_SIZE(vsock_bind_table)) { + struct list_head *head = &vsock_bind_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, bound_table) { + struct sock *sk = sk_vsock(vsk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_bind; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_bind; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_bind: + i++; + } + last_i = 0; + bucket++; + } + + table++; + bucket = 0; + } + + /* Connected table (accepted connections) */ + while (bucket < ARRAY_SIZE(vsock_connected_table)) { + struct list_head *head = &vsock_connected_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, connected_table) { + struct sock *sk = sk_vsock(vsk); + + /* Skip sockets we've already seen above */ + if (__vsock_in_bound_table(vsk)) + continue; + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_connected; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_connected; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_connected: + i++; + } + last_i = 0; + bucket++; + } + +done: + spin_unlock_bh(&vsock_table_lock); + + cb->args[0] = table; + cb->args[1] = bucket; + cb->args[2] = i; + + return skb->len; +} + +static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct vsock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = vsock_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler vsock_diag_handler = { + .family = AF_VSOCK, + .dump = vsock_diag_handler_dump, +}; + +static int __init vsock_diag_init(void) +{ + return sock_diag_register(&vsock_diag_handler); +} + +static void __exit vsock_diag_exit(void) +{ + sock_diag_unregister(&vsock_diag_handler); +} + +module_init(vsock_diag_init); +module_exit(vsock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, + 40 /* AF_VSOCK */); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 14ed5a344cdf..bbac023e70d1 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -310,7 +310,7 @@ static void hvs_close_connection(struct vmbus_channel *chan) struct sock *sk = get_per_channel_state(chan); struct vsock_sock *vsk = vsock_sk(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; @@ -344,8 +344,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!sk) return; - if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) || - (!conn_from_host && sk->sk_state != SS_CONNECTING)) + if ((conn_from_host && sk->sk_state != TCP_LISTEN) || + (!conn_from_host && sk->sk_state != TCP_SYN_SENT)) goto out; if (conn_from_host) { @@ -357,7 +357,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!new) goto out; - new->sk_state = SS_CONNECTING; + new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); hvs_new = vnew->trans; hvs_new->chan = chan; @@ -384,7 +384,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vmbus_set_chn_rescind_callback(chan, hvs_close_connection); if (conn_from_host) { - new->sk_state = SS_CONNECTED; + new->sk_state = TCP_ESTABLISHED; sk->sk_ack_backlog++; hvs_addr_init(&vnew->local_addr, if_type); @@ -399,7 +399,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vsock_enqueue_accept(sk, new); release_sock(sk); } else { - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsock_sk(sk)); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 403d86e80162..8e03bd3f3668 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -414,7 +414,7 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { lock_sock(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); release_sock(sk); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index edba7ab97563..3ae3a33da70b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -708,7 +708,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); if (vsk->close_work_scheduled && @@ -748,8 +748,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) { struct sock *sk = &vsk->sk; - if (!(sk->sk_state == SS_CONNECTED || - sk->sk_state == SS_DISCONNECTING)) + if (!(sk->sk_state == TCP_ESTABLISHED || + sk->sk_state == TCP_CLOSING)) return true; /* Already received SHUTDOWN from peer, reply with RST */ @@ -801,7 +801,7 @@ virtio_transport_recv_connecting(struct sock *sk, switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RESPONSE: - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -821,7 +821,7 @@ virtio_transport_recv_connecting(struct sock *sk, destroy: virtio_transport_reset(vsk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -857,7 +857,7 @@ virtio_transport_recv_connected(struct sock *sk, vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; if (le32_to_cpu(pkt->hdr.flags)) sk->sk_state_change(sk); break; @@ -928,7 +928,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) lock_sock_nested(child, SINGLE_DEPTH_NESTING); - child->sk_state = SS_CONNECTED; + child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), @@ -1016,18 +1016,18 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) sk->sk_write_space(sk); switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: virtio_transport_recv_listen(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: virtio_transport_recv_connecting(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: virtio_transport_recv_connected(sk, pkt); break; - case SS_DISCONNECTING: + case TCP_CLOSING: virtio_transport_recv_disconnecting(sk, pkt); virtio_transport_free_pkt(pkt); break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 10ae7823a19d..391775e3575c 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -21,7 +21,6 @@ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/list.h> -#include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/net.h> @@ -743,7 +742,7 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) /* The local context ID may be out of date, update it. */ vsk->local_addr.svm_cid = dst.svm_cid; - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vmci_trans(vsk)->notify_ops->handle_notify_pkt( sk, pkt, true, &dst, &src, &bh_process_pkt); @@ -801,7 +800,9 @@ static void vmci_transport_handle_detach(struct sock *sk) * left in our consume queue. */ if (vsock_stream_has_data(vsk) <= 0) { - if (sk->sk_state == SS_CONNECTING) { + sk->sk_state = TCP_CLOSE; + + if (sk->sk_state == TCP_SYN_SENT) { /* The peer may detach from a queue pair while * we are still in the connecting state, i.e., * if the peer VM is killed after attaching to @@ -810,12 +811,10 @@ static void vmci_transport_handle_detach(struct sock *sk) * event like a reset. */ - sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); return; } - sk->sk_state = SS_UNCONNECTED; } sk->sk_state_change(sk); } @@ -883,17 +882,17 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work) vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context; switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: vmci_transport_recv_listen(sk, pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: /* Processing of pending connections for servers goes through * the listening socket, so see vmci_transport_recv_listen() * for that path. */ vmci_transport_recv_connecting_client(sk, pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: vmci_transport_recv_connected(sk, pkt); break; default: @@ -942,7 +941,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context; switch (pending->sk_state) { - case SS_CONNECTING: + case TCP_SYN_SENT: err = vmci_transport_recv_connecting_server(sk, pending, pkt); @@ -1072,7 +1071,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_add_pending(sk, pending); sk->sk_ack_backlog++; - pending->sk_state = SS_CONNECTING; + pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; vmci_trans(vpending)->queue_pair_size = qp_size; @@ -1197,11 +1196,11 @@ vmci_transport_recv_connecting_server(struct sock *listener, * the socket will be valid until it is removed from the queue. * * If we fail sending the attach below, we remove the socket from the - * connected list and move the socket to SS_UNCONNECTED before + * connected list and move the socket to TCP_CLOSE before * releasing the lock, so a pending slow path processing of an incoming * packet will not see the socket in the connected state in that case. */ - pending->sk_state = SS_CONNECTED; + pending->sk_state = TCP_ESTABLISHED; vsock_insert_connected(vpending); @@ -1232,7 +1231,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, destroy: pending->sk_err = skerr; - pending->sk_state = SS_UNCONNECTED; + pending->sk_state = TCP_CLOSE; /* As long as we drop our reference, all necessary cleanup will handle * when the cleanup function drops its reference and our destruct * implementation is called. Note that since the listen handler will @@ -1270,7 +1269,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, * accounting (it can already be found since it's in the bound * table). */ - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -1338,7 +1337,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, destroy: vmci_transport_send_reset(sk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -1526,7 +1525,7 @@ static int vmci_transport_recv_connected(struct sock *sk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); break; @@ -1790,7 +1789,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) err = vmci_transport_send_conn_request( sk, vmci_trans(vsk)->queue_pair_size); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } } else { @@ -1800,7 +1799,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) sk, vmci_trans(vsk)->queue_pair_size, supported_proto_versions); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 1406db4d97d1..41fb427f150a 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -355,7 +355,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!send_waiting_read(sk, 1)) return -1; diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index f3a0afc46208..0cc84f2bb05e 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -176,7 +176,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vsock_block_update_write_window(sk); *data_ready_now = false; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 90e212db6889..5129342151e6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10021,6 +10021,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!setup.chandef.chan) + return -EINVAL; + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, &setup.beacon_rate); if (err) @@ -10937,6 +10940,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) |