summary refs log tree commit diff stats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r-- net/8021q/vlan.c 13
-rw-r--r-- net/batman-adv/Kconfig 8
-rw-r--r-- net/batman-adv/bat_iv_ogm.h 6
-rw-r--r-- net/batman-adv/bat_v_ogm.h 6
-rw-r--r-- net/batman-adv/bridge_loop_avoidance.c 2
-rw-r--r-- net/batman-adv/debugfs.c 2
-rw-r--r-- net/batman-adv/originator.c 17
-rw-r--r-- net/batman-adv/types.h 7
-rw-r--r-- net/core/dev.c 828
-rw-r--r-- net/core/devlink.c 1322
-rw-r--r-- net/core/ethtool.c 1
-rw-r--r-- net/core/filter.c 3
-rw-r--r-- net/core/flow_dissector.c 48
-rw-r--r-- net/core/neighbour.c 3
-rw-r--r-- net/core/net-sysfs.c 123
-rw-r--r-- net/core/pktgen.c 10
-rw-r--r-- net/core/rtnetlink.c 83
-rw-r--r-- net/core/skbuff.c 7
-rw-r--r-- net/core/sock.c 47
-rw-r--r-- net/core/xdp.c 38
-rw-r--r-- net/decnet/dn_nsp_in.c 1
-rw-r--r-- net/dsa/dsa2.c 14
-rw-r--r-- net/dsa/slave.c 2
-rw-r--r-- net/dsa/switch.c 22
-rw-r--r-- net/ethernet/eth.c 12
-rw-r--r-- net/ipv4/af_inet.c 14
-rw-r--r-- net/ipv4/esp4_offload.c 4
-rw-r--r-- net/ipv4/fou.c 20
-rw-r--r-- net/ipv4/gre_offload.c 8
-rw-r--r-- net/ipv4/icmp.c 9
-rw-r--r-- net/ipv4/inet_fragment.c 1
-rw-r--r-- net/ipv4/ip_gre.c 2
-rw-r--r-- net/ipv4/ip_input.c 147
-rw-r--r-- net/ipv4/ip_output.c 22
-rw-r--r-- net/ipv4/ipmr.c 22
-rw-r--r-- net/ipv4/ipmr_base.c 1
-rw-r--r-- net/ipv4/netfilter/nf_log_ipv4.c 8
-rw-r--r-- net/ipv4/ping.c 10
-rw-r--r-- net/ipv4/proc.c 2
-rw-r--r-- net/ipv4/raw.c 11
-rw-r--r-- net/ipv4/tcp.c 20
-rw-r--r-- net/ipv4/tcp_bbr.c 6
-rw-r--r-- net/ipv4/tcp_input.c 48
-rw-r--r-- net/ipv4/tcp_ipv4.c 3
-rw-r--r-- net/ipv4/tcp_minisocks.c 229
-rw-r--r-- net/ipv4/tcp_offload.c 17
-rw-r--r-- net/ipv4/tcp_output.c 14
-rw-r--r-- net/ipv4/tcp_rate.c 4
-rw-r--r-- net/ipv4/udp.c 11
-rw-r--r-- net/ipv4/udp_offload.c 13
-rw-r--r-- net/ipv6/addrconf.c 45
-rw-r--r-- net/ipv6/af_inet6.c 1
-rw-r--r-- net/ipv6/datagram.c 4
-rw-r--r-- net/ipv6/esp6_offload.c 4
-rw-r--r-- net/ipv6/icmp.c 14
-rw-r--r-- net/ipv6/ila/Makefile 2
-rw-r--r-- net/ipv6/ila/ila.h 27
-rw-r--r-- net/ipv6/ila/ila_common.c 30
-rw-r--r-- net/ipv6/ila/ila_main.c 121
-rw-r--r-- net/ipv6/ila/ila_xlat.c 291
-rw-r--r-- net/ipv6/ip6_flowlabel.c 3
-rw-r--r-- net/ipv6/ip6_gre.c 2
-rw-r--r-- net/ipv6/ip6_input.c 131
-rw-r--r-- net/ipv6/ip6_offload.c 16
-rw-r--r-- net/ipv6/ip6_output.c 38
-rw-r--r-- net/ipv6/ip6mr.c 1
-rw-r--r-- net/ipv6/ipv6_sockglue.c 3
-rw-r--r-- net/ipv6/netfilter/nf_log_ipv6.c 8
-rw-r--r-- net/ipv6/ping.c 7
-rw-r--r-- net/ipv6/raw.c 18
-rw-r--r-- net/ipv6/seg6.c 1
-rw-r--r-- net/ipv6/seg6_hmac.c 1
-rw-r--r-- net/ipv6/tcpv6_offload.c 4
-rw-r--r-- net/ipv6/udp.c 13
-rw-r--r-- net/ipv6/udp_offload.c 4
-rw-r--r-- net/l2tp/l2tp_core.c 15
-rw-r--r-- net/l2tp/l2tp_core.h 12
-rw-r--r-- net/l2tp/l2tp_debugfs.c 3
-rw-r--r-- net/l2tp/l2tp_ip6.c 10
-rw-r--r-- net/l2tp/l2tp_ppp.c 180
-rw-r--r-- net/mac80211/Makefile 1
-rw-r--r-- net/mac80211/agg-rx.c 10
-rw-r--r-- net/mac80211/agg-tx.c 19
-rw-r--r-- net/mac80211/cfg.c 7
-rw-r--r-- net/mac80211/ethtool.c 6
-rw-r--r-- net/mac80211/he.c 55
-rw-r--r-- net/mac80211/ht.c 2
-rw-r--r-- net/mac80211/ieee80211_i.h 47
-rw-r--r-- net/mac80211/iface.c 4
-rw-r--r-- net/mac80211/main.c 36
-rw-r--r-- net/mac80211/mlme.c 312
-rw-r--r-- net/mac80211/offchannel.c 2
-rw-r--r-- net/mac80211/rx.c 129
-rw-r--r-- net/mac80211/scan.c 56
-rw-r--r-- net/mac80211/sta_info.c 101
-rw-r--r-- net/mac80211/sta_info.h 20
-rw-r--r-- net/mac80211/trace.h 2
-rw-r--r-- net/mac80211/tx.c 21
-rw-r--r-- net/mac80211/util.c 159
-rw-r--r-- net/netfilter/core.c 15
-rw-r--r-- net/netfilter/nf_conntrack_broadcast.c 2
-rw-r--r-- net/netfilter/nf_conntrack_core.c 36
-rw-r--r-- net/netfilter/nf_log_common.c 5
-rw-r--r-- net/netfilter/nf_nat_core.c 6
-rw-r--r-- net/netfilter/nf_tables_api.c 1
-rw-r--r-- net/netfilter/nft_meta.c 9
-rw-r--r-- net/netfilter/nft_socket.c 5
-rw-r--r-- net/netfilter/xt_cgroup.c 6
-rw-r--r-- net/netfilter/xt_owner.c 2
-rw-r--r-- net/netfilter/xt_recent.c 3
-rw-r--r-- net/netfilter/xt_socket.c 8
-rw-r--r-- net/openvswitch/actions.c 33
-rw-r--r-- net/openvswitch/flow_netlink.c 80
-rw-r--r-- net/packet/af_packet.c 16
-rw-r--r-- net/rds/ib_recv.c 5
-rw-r--r-- net/sched/Kconfig 22
-rw-r--r-- net/sched/Makefile 2
-rw-r--r-- net/sched/act_api.c 424
-rw-r--r-- net/sched/act_bpf.c 34
-rw-r--r-- net/sched/act_connmark.c 29
-rw-r--r-- net/sched/act_csum.c 34
-rw-r--r-- net/sched/act_gact.c 31
-rw-r--r-- net/sched/act_ife.c 33
-rw-r--r-- net/sched/act_ipt.c 44
-rw-r--r-- net/sched/act_mirred.c 38
-rw-r--r-- net/sched/act_nat.c 30
-rw-r--r-- net/sched/act_pedit.c 111
-rw-r--r-- net/sched/act_police.c 31
-rw-r--r-- net/sched/act_sample.c 34
-rw-r--r-- net/sched/act_simple.c 31
-rw-r--r-- net/sched/act_skbedit.c 175
-rw-r--r-- net/sched/act_skbmod.c 34
-rw-r--r-- net/sched/act_tunnel_key.c 295
-rw-r--r-- net/sched/act_vlan.c 40
-rw-r--r-- net/sched/cls_api.c 106
-rw-r--r-- net/sched/cls_bpf.c 39
-rw-r--r-- net/sched/cls_flower.c 177
-rw-r--r-- net/sched/cls_matchall.c 32
-rw-r--r-- net/sched/cls_u32.c 111
-rw-r--r-- net/sched/sch_api.c 11
-rw-r--r-- net/sched/sch_cake.c 3019
-rw-r--r-- net/sched/sch_etf.c 484
-rw-r--r-- net/sched/sch_htb.c 13
-rw-r--r-- net/sched/sch_netem.c 73
-rw-r--r-- net/sctp/associola.c 15
-rw-r--r-- net/sctp/input.c 1
-rw-r--r-- net/sctp/ipv6.c 20
-rw-r--r-- net/sctp/protocol.c 16
-rw-r--r-- net/sctp/socket.c 245
-rw-r--r-- net/smc/Makefile 2
-rw-r--r-- net/smc/af_smc.c 228
-rw-r--r-- net/smc/smc.h 7
-rw-r--r-- net/smc/smc_cdc.c 86
-rw-r--r-- net/smc/smc_cdc.h 43
-rw-r--r-- net/smc/smc_clc.c 193
-rw-r--r-- net/smc/smc_clc.h 81
-rw-r--r-- net/smc/smc_core.c 285
-rw-r--r-- net/smc/smc_core.h 72
-rw-r--r-- net/smc/smc_diag.c 18
-rw-r--r-- net/smc/smc_ib.c 134
-rw-r--r-- net/smc/smc_ib.h 4
-rw-r--r-- net/smc/smc_ism.c 314
-rw-r--r-- net/smc/smc_ism.h 48
-rw-r--r-- net/smc/smc_pnet.c 157
-rw-r--r-- net/smc/smc_pnet.h 16
-rw-r--r-- net/smc/smc_rx.c 2
-rw-r--r-- net/smc/smc_tx.c 193
-rw-r--r-- net/smc/smc_tx.h 2
-rw-r--r-- net/strparser/strparser.c 26
-rw-r--r-- net/tipc/bearer.c 1
-rw-r--r-- net/tipc/group.c 37
-rw-r--r-- net/tipc/group.h 1
-rw-r--r-- net/tipc/link.c 125
-rw-r--r-- net/tipc/link.h 2
-rw-r--r-- net/tipc/msg.c 35
-rw-r--r-- net/tipc/node.c 90
-rw-r--r-- net/tipc/node.h 14
-rw-r--r-- net/tipc/socket.c 5
-rw-r--r-- net/tls/tls_device.c 301
-rw-r--r-- net/tls/tls_device_fallback.c 9
-rw-r--r-- net/tls/tls_main.c 32
-rw-r--r-- net/tls/tls_sw.c 135
-rw-r--r-- net/wireless/core.c 21
-rw-r--r-- net/wireless/core.h 2
-rw-r--r-- net/wireless/nl80211.c 194
-rw-r--r-- net/wireless/sysfs.c 4
-rw-r--r-- net/wireless/util.c 87
-rw-r--r-- net/wireless/wext-compat.c 10
188 files changed, 12170 insertions, 2181 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 8ccee3d01822..5e9950453955 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -647,13 +647,14 @@ out:
return err;
}
-static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *vlan_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
- struct vlan_hdr *vhdr;
- unsigned int hlen, off_vlan;
const struct packet_offload *ptype;
+ unsigned int hlen, off_vlan;
+ struct sk_buff *pp = NULL;
+ struct vlan_hdr *vhdr;
+ struct sk_buff *p;
__be16 type;
int flush = 1;
@@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
struct vlan_hdr *vhdr2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index de8034d80623..361116f77cb9 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -24,7 +24,6 @@ config BATMAN_ADV
depends on NET
select CRC16
select LIBCRC32C
- default n
help
B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
a routing protocol for multi-hop ad-hoc mesh networks. The
@@ -33,7 +32,7 @@ config BATMAN_ADV
tools.
config BATMAN_ADV_BATMAN_V
- bool "B.A.T.M.A.N. V protocol (experimental)"
+ bool "B.A.T.M.A.N. V protocol"
depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
default y
help
@@ -60,7 +59,7 @@ config BATMAN_ADV_BLA
config BATMAN_ADV_DAT
bool "Distributed ARP Table"
depends on BATMAN_ADV && INET
- default n
+ default y
help
This option enables DAT (Distributed ARP Table), a DHT based
mechanism that increases ARP reliability on sparse wireless
@@ -70,7 +69,6 @@ config BATMAN_ADV_DAT
config BATMAN_ADV_NC
bool "Network Coding"
depends on BATMAN_ADV
- default n
help
This option enables network coding, a mechanism that aims to
increase the overall network throughput by fusing multiple
@@ -84,7 +82,6 @@ config BATMAN_ADV_NC
config BATMAN_ADV_MCAST
bool "Multicast optimisation"
depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
- default n
help
This option enables the multicast optimisation which aims to
reduce the air overhead while improving the reliability of
@@ -94,7 +91,6 @@ config BATMAN_ADV_DEBUGFS
bool "batman-adv debugfs entries"
depends on BATMAN_ADV
depends on DEBUG_FS
- default n
help
Enable this to export routing related debug tables via debugfs.
The information for each soft-interface and used hard-interface can be
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 317cafd302cf..3dc6a7a43eb7 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -16,11 +16,11 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_
-#define _BATMAN_ADV_BATADV_IV_OGM_H_
+#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_
+#define _NET_BATMAN_ADV_BAT_IV_OGM_H_
#include "main.h"
int batadv_iv_init(void);
-#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */
+#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index ed36c5e79fde..e5be14c908c6 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -16,8 +16,8 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _BATMAN_ADV_BATADV_V_OGM_H_
-#define _BATMAN_ADV_BATADV_V_OGM_H_
+#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_
+#define _NET_BATMAN_ADV_BAT_V_OGM_H_
#include "main.h"
@@ -34,4 +34,4 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
int batadv_v_ogm_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming);
-#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */
+#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index a2de5a44bd41..ff9659af6b91 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1449,7 +1449,7 @@ static void batadv_bla_periodic_work(struct work_struct *work)
* detection frames. Set the locally administered bit to avoid
* collisions with users mac addresses.
*/
- random_ether_addr(bat_priv->bla.loopdetect_addr);
+ eth_random_addr(bat_priv->bla.loopdetect_addr);
bat_priv->bla.loopdetect_addr[0] = 0xba;
bat_priv->bla.loopdetect_addr[1] = 0xbe;
bat_priv->bla.loopdetect_lasttime = jiffies;
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 87479c60670e..3cb82378300b 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -118,7 +118,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
#ifdef CONFIG_BATMAN_ADV_DAT
/**
- * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache
+ * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache
* @inode: inode which was opened
* @file: file handle to be initialized
*
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 716e5b43acfa..1d295da3e342 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1339,7 +1339,11 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
return false;
}
-static void _batadv_purge_orig(struct batadv_priv *bat_priv)
+/**
+ * batadv_purge_orig_ref() - Purge all outdated originators
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
{
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_node *node_tmp;
@@ -1385,21 +1389,12 @@ static void batadv_purge_orig(struct work_struct *work)
delayed_work = to_delayed_work(work);
bat_priv = container_of(delayed_work, struct batadv_priv, orig_work);
- _batadv_purge_orig(bat_priv);
+ batadv_purge_orig_ref(bat_priv);
queue_delayed_work(batadv_event_workqueue,
&bat_priv->orig_work,
msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
}
-/**
- * batadv_purge_orig_ref() - Purge all outdated originators
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
-{
- _batadv_purge_orig(bat_priv);
-}
-
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 360357f83f20..343d304851a5 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -43,12 +43,13 @@ struct seq_file;
#ifdef CONFIG_BATMAN_ADV_DAT
/**
- * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is
- * changed, BATADV_DAT_ADDR_MAX is changed as well.
+ * typedef batadv_dat_addr_t - type used for all DHT addresses
+ *
+ * If it is changed, BATADV_DAT_ADDR_MAX is changed as well.
*
* *Please be careful: batadv_dat_addr_t must be UNSIGNED*
*/
-#define batadv_dat_addr_t u16
+typedef u16 batadv_dat_addr_t;
#endif /* CONFIG_BATMAN_ADV_DAT */
diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7444e6..4f8b92d81d10 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -149,7 +149,6 @@
#include "net-sysfs.h"
-/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8
/* This should be increased if a protocol with a bigger head is added. */
@@ -2068,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
int i;
+ /* walk through the TCs and see if it falls into any of them */
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
if ((txq - tc->offset) < tc->count)
return i;
}
+ /* didn't find it, just return -1 to indicate no match */
return -1;
}
@@ -2081,6 +2082,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
@@ -2092,7 +2097,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
int pos;
if (dev_maps)
- map = xmap_dereference(dev_maps->cpu_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
@@ -2105,7 +2110,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
break;
}
- RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
}
@@ -2135,33 +2140,68 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
return active;
}
+static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
+ struct xps_dev_maps *dev_maps, unsigned int nr_ids,
+ u16 offset, u16 count, bool is_rxqs_map)
+{
+ bool active = false;
+ int i, j;
+
+ for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
+ j < nr_ids;)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
+ count);
+ if (!active) {
+ if (is_rxqs_map) {
+ RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+ } else {
+ RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+
+ for (i = offset + (count - 1); count--; i--)
+ netdev_queue_numa_node_write(
+ netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+ }
+ kfree_rcu(dev_maps, rcu);
+ }
+}
+
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
+ const unsigned long *possible_mask = NULL;
struct xps_dev_maps *dev_maps;
- int cpu, i;
- bool active = false;
+ unsigned int nr_ids;
+
+ if (!static_key_false(&xps_needed))
+ return;
mutex_lock(&xps_map_mutex);
- dev_maps = xmap_dereference(dev->xps_maps);
+ if (static_key_false(&xps_rxqs_needed)) {
+ dev_maps = xmap_dereference(dev->xps_rxqs_map);
+ if (dev_maps) {
+ nr_ids = dev->num_rx_queues;
+ clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+ offset, count, true);
+ }
+ }
+
+ dev_maps = xmap_dereference(dev->xps_cpus_map);
if (!dev_maps)
goto out_no_maps;
- for_each_possible_cpu(cpu)
- active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
- offset, count);
-
- if (!active) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
- kfree_rcu(dev_maps, rcu);
- }
-
- for (i = offset + (count - 1); count--; i--)
- netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
+ if (num_possible_cpus() > 1)
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ nr_ids = nr_cpu_ids;
+ clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
+ false);
out_no_maps:
+ if (static_key_enabled(&xps_rxqs_needed))
+ static_key_slow_dec(&xps_rxqs_needed);
+
+ static_key_slow_dec(&xps_needed);
mutex_unlock(&xps_map_mutex);
}
@@ -2170,8 +2210,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
-static struct xps_map *expand_xps_map(struct xps_map *map,
- int cpu, u16 index)
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+ u16 index, bool is_rxqs_map)
{
struct xps_map *new_map;
int alloc_len = XPS_MIN_MAP_ALLOC;
@@ -2183,7 +2223,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
return map;
}
- /* Need to add queue to this CPU's existing map */
+ /* Need to add tx-queue to this CPU's/rx-queue's existing map */
if (map) {
if (pos < map->alloc_len)
return map;
@@ -2191,9 +2231,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
alloc_len = map->alloc_len * 2;
}
- /* Need to allocate new map to store queue on this CPU's map */
- new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
- cpu_to_node(cpu));
+ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+ * map
+ */
+ if (is_rxqs_map)
+ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+ else
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(attr_index));
if (!new_map)
return NULL;
@@ -2205,32 +2250,52 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
return new_map;
}
-int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
- u16 index)
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, bool is_rxqs_map)
{
+ const unsigned long *online_mask = NULL, *possible_mask = NULL;
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
- int i, cpu, tci, numa_node_id = -2;
+ int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
bool active = false;
+ unsigned int nr_ids;
if (dev->num_tc) {
+ /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+ if (num_tc < 0)
+ return -EINVAL;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
}
- maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
- if (maps_sz < L1_CACHE_BYTES)
- maps_sz = L1_CACHE_BYTES;
-
mutex_lock(&xps_map_mutex);
+ if (is_rxqs_map) {
+ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+ dev_maps = xmap_dereference(dev->xps_rxqs_map);
+ nr_ids = dev->num_rx_queues;
+ } else {
+ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+ if (num_possible_cpus() > 1) {
+ online_mask = cpumask_bits(cpu_online_mask);
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ }
+ dev_maps = xmap_dereference(dev->xps_cpus_map);
+ nr_ids = nr_cpu_ids;
+ }
- dev_maps = xmap_dereference(dev->xps_maps);
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
/* allocate memory for queue storage */
- for_each_cpu_and(cpu, cpu_online_mask, mask) {
+ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
+ j < nr_ids;) {
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
@@ -2238,73 +2303,85 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -ENOMEM;
}
- tci = cpu * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
+ tci = j * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
- map = expand_xps_map(map, cpu, index);
+ map = expand_xps_map(map, j, index, is_rxqs_map);
if (!map)
goto error;
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
- for_each_possible_cpu(cpu) {
+ static_key_slow_inc(&xps_needed);
+ if (is_rxqs_map)
+ static_key_slow_inc(&xps_rxqs_needed);
+
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
/* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+ for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
/* We need to explicitly update tci as prevous loop
* could break out early if dev_maps is NULL.
*/
- tci = cpu * num_tc + tc;
+ tci = j * num_tc + tc;
- if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
- /* add queue to CPU maps */
+ if (netif_attr_test_mask(j, mask, nr_ids) &&
+ netif_attr_test_online(j, online_mask, nr_ids)) {
+ /* add tx-queue to CPU/rx-queue maps */
int pos = 0;
- map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (numa_node_id == -2)
- numa_node_id = cpu_to_node(cpu);
- else if (numa_node_id != cpu_to_node(cpu))
- numa_node_id = -1;
+ if (!is_rxqs_map) {
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(j);
+ else if (numa_node_id != cpu_to_node(j))
+ numa_node_id = -1;
+ }
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
/* copy maps belonging to foreign traffic classes */
for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
}
- rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+ if (is_rxqs_map)
+ rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
+ else
+ rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for_each_possible_cpu(cpu) {
- for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
- map = xmap_dereference(dev_maps->cpu_map[tci]);
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
@@ -2317,19 +2394,23 @@ out_no_old_maps:
active = true;
out_no_new_maps:
- /* update Tx queue numa node */
- netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
- (numa_node_id >= 0) ? numa_node_id :
- NUMA_NO_NODE);
+ if (!is_rxqs_map) {
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ?
+ numa_node_id : NUMA_NO_NODE);
+ }
if (!dev_maps)
goto out_no_maps;
- /* removes queue from unused CPUs */
- for_each_possible_cpu(cpu) {
- for (i = tc, tci = cpu * num_tc; i--; tci++)
+ /* removes tx-queue from unused CPUs/rx-queues */
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = tc, tci = j * num_tc; i--; tci++)
active |= remove_xps_queue(dev_maps, tci, index);
- if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+ if (!netif_attr_test_mask(j, mask, nr_ids) ||
+ !netif_attr_test_online(j, online_mask, nr_ids))
active |= remove_xps_queue(dev_maps, tci, index);
for (i = num_tc - tc, tci++; --i; tci++)
active |= remove_xps_queue(dev_maps, tci, index);
@@ -2337,7 +2418,10 @@ out_no_new_maps:
/* free map if not active */
if (!active) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
+ if (is_rxqs_map)
+ RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+ else
+ RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
kfree_rcu(dev_maps, rcu);
}
@@ -2347,11 +2431,12 @@ out_no_maps:
return 0;
error:
/* remove any maps that we added */
- for_each_possible_cpu(cpu) {
- for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
map = dev_maps ?
- xmap_dereference(dev_maps->cpu_map[tci]) :
+ xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
kfree(new_map);
@@ -2363,14 +2448,34 @@ error:
kfree(new_dev_maps);
return -ENOMEM;
}
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ return __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+}
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+ /* Unbind any subordinate channels */
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev)
+ netdev_unbind_sb_channel(dev, txq->sb_dev);
+ }
+}
+
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
#endif
+ netdev_unbind_all_sb_channels(dev);
+
+ /* Reset TC configuration of device */
dev->num_tc = 0;
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2399,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
#endif
+ netdev_unbind_all_sb_channels(dev);
+
dev->num_tc = num_tc;
return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev)
+{
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(sb_dev, 0);
+#endif
+ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev == sb_dev)
+ txq->sb_dev = NULL;
+ }
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+ struct net_device *sb_dev,
+ u8 tc, u16 count, u16 offset)
+{
+ /* Make certain the sb_dev and dev are already configured */
+ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
+ return -EINVAL;
+
+ /* We cannot hand out queues we don't have */
+ if ((offset + count) > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Record the mapping */
+ sb_dev->tc_to_txq[tc].count = count;
+ sb_dev->tc_to_txq[tc].offset = offset;
+
+ /* Provide a way for Tx queue to find the tc_to_txq map or
+ * XPS map for itself.
+ */
+ while (count--)
+ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
+
+int netdev_set_sb_channel(struct net_device *dev, u16 channel)
+{
+ /* Do not use a multiqueue device to represent a subordinate channel */
+ if (netif_is_multiqueue(dev))
+ return -ENODEV;
+
+ /* We allow channels 1 - 32767 to be used for subordinate channels.
+ * Channel 0 is meant to be "native" mode and used only to represent
+ * the main root device. We allow writing 0 to reset the device back
+ * to normal mode after being used as a subordinate channel.
+ */
+ if (channel > S16_MAX)
+ return -EINVAL;
+
+ dev->num_tc = -channel;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_sb_channel);
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -2615,24 +2786,26 @@ EXPORT_SYMBOL(netif_device_attach);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
+static u16 skb_tx_hash(const struct net_device *dev,
+ const struct net_device *sb_dev,
+ struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
u16 qcount = dev->real_num_tx_queues;
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+ qoffset = sb_dev->tc_to_txq[tc].offset;
+ qcount = sb_dev->tc_to_txq[tc].count;
+ }
+
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
while (unlikely(hash >= qcount))
hash -= qcount;
- return hash;
- }
-
- if (dev->num_tc) {
- u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-
- qoffset = dev->tc_to_txq[tc].offset;
- qcount = dev->tc_to_txq[tc].count;
+ return hash + qoffset;
}
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
@@ -3376,32 +3549,64 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
}
#endif /* CONFIG_NET_EGRESS */
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+ struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+ struct xps_map *map;
+ int queue_index = -1;
+
+ if (dev->num_tc) {
+ tci *= dev->num_tc;
+ tci += netdev_get_prio_tc_map(dev, skb->priority);
+ }
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(
+ skb_get_hash(skb), map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
+ struct sk_buff *skb)
{
#ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
- struct xps_map *map;
+ struct sock *sk = skb->sk;
int queue_index = -1;
+ if (!static_key_false(&xps_needed))
+ return -1;
+
rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
+ if (!static_key_false(&xps_rxqs_needed))
+ goto get_cpus_map;
+
+ dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
if (dev_maps) {
- unsigned int tci = skb->sender_cpu - 1;
+ int tci = sk_rx_queue_get(sk);
- if (dev->num_tc) {
- tci *= dev->num_tc;
- tci += netdev_get_prio_tc_map(dev, skb->priority);
- }
+ if (tci >= 0 && tci < dev->num_rx_queues)
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+ }
- map = rcu_dereference(dev_maps->cpu_map[tci]);
- if (map) {
- if (map->len == 1)
- queue_index = map->queues[0];
- else
- queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
- map->len)];
- if (unlikely(queue_index >= dev->real_num_tx_queues))
- queue_index = -1;
+get_cpus_map:
+ if (queue_index < 0) {
+ dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ if (dev_maps) {
+ unsigned int tci = skb->sender_cpu - 1;
+
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
}
}
rcu_read_unlock();
@@ -3412,17 +3617,36 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
#endif
}
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev,
+ select_queue_fallback_t fallback)
+{
+ return 0;
+}
+EXPORT_SYMBOL(dev_pick_tx_zero);
+
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev,
+ select_queue_fallback_t fallback)
+{
+ return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+EXPORT_SYMBOL(dev_pick_tx_cpu_id);
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
+ sb_dev = sb_dev ? : dev;
+
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
- int new_index = get_xps_queue(dev, skb);
+ int new_index = get_xps_queue(dev, sb_dev, skb);
if (new_index < 0)
- new_index = skb_tx_hash(dev, skb);
+ new_index = skb_tx_hash(dev, sb_dev, skb);
if (queue_index != new_index && sk &&
sk_fullsock(sk) &&
@@ -3437,7 +3661,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
struct netdev_queue *netdev_pick_tx(struct net_device *dev,
struct sk_buff *skb,
- void *accel_priv)
+ struct net_device *sb_dev)
{
int queue_index = 0;
@@ -3452,10 +3676,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
- queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
__netdev_pick_tx);
else
- queue_index = __netdev_pick_tx(dev, skb);
+ queue_index = __netdev_pick_tx(dev, skb, sb_dev);
queue_index = netdev_cap_txqueue(dev, queue_index);
}
@@ -3467,7 +3691,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
- * @accel_priv: private data used for L2 forwarding offload
+ * @sb_dev: subordinate device used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
@@ -3490,7 +3714,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
-static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
+static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -3529,7 +3753,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
else
skb_dst_force(skb);
- txq = netdev_pick_tx(dev, skb, accel_priv);
+ txq = netdev_pick_tx(dev, skb, sb_dev);
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
@@ -3603,9 +3827,9 @@ int dev_queue_xmit(struct sk_buff *skb)
}
EXPORT_SYMBOL(dev_queue_xmit);
-int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
{
- return __dev_queue_xmit(skb, accel_priv);
+ return __dev_queue_xmit(skb, sb_dev);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
@@ -4494,7 +4718,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+ struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
@@ -4624,8 +4849,7 @@ skip_classify:
if (pt_prev) {
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
- else
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ *ppt_prev = pt_prev;
} else {
drop:
if (!deliver_exact)
@@ -4643,6 +4867,18 @@ out:
return ret;
}
+static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
+{
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+ int ret;
+
+ ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ if (pt_prev)
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ return ret;
+}
+
/**
* netif_receive_skb_core - special purpose version of netif_receive_skb
* @skb: buffer to process
@@ -4663,13 +4899,72 @@ int netif_receive_skb_core(struct sk_buff *skb)
int ret;
rcu_read_lock();
- ret = __netif_receive_skb_core(skb, false);
+ ret = __netif_receive_skb_one_core(skb, false);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(netif_receive_skb_core);
+static inline void __netif_receive_skb_list_ptype(struct list_head *head,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ struct sk_buff *skb, *next;
+
+ if (!pt_prev)
+ return;
+ if (list_empty(head))
+ return;
+ if (pt_prev->list_func != NULL)
+ pt_prev->list_func(head, pt_prev, orig_dev);
+ else
+ list_for_each_entry_safe(skb, next, head, list)
+ pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+ /* Fast-path assumptions:
+ * - There is no RX handler.
+ * - Only one packet_type matches.
+ * If either of these fails, we will end up doing some per-packet
+ * processing in-line, then handling the 'last ptype' for the whole
+ * sublist. This can't cause out-of-order delivery to any single ptype,
+ * because the 'last ptype' must be constant across the sublist, and all
+ * other ptypes are handled per-packet.
+ */
+ /* Current (common) ptype of sublist */
+ struct packet_type *pt_curr = NULL;
+ /* Current (common) orig_dev of sublist */
+ struct net_device *od_curr = NULL;
+ struct list_head sublist;
+ struct sk_buff *skb, *next;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+
+ list_del(&skb->list);
+ __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ if (!pt_prev)
+ continue;
+ if (pt_curr != pt_prev || od_curr != orig_dev) {
+ /* dispatch old sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ pt_curr = pt_prev;
+ od_curr = orig_dev;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+
+ /* dispatch final sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+}
+
static int __netif_receive_skb(struct sk_buff *skb)
{
int ret;
@@ -4687,14 +4982,44 @@ static int __netif_receive_skb(struct sk_buff *skb)
* context down to all allocation sites.
*/
noreclaim_flag = memalloc_noreclaim_save();
- ret = __netif_receive_skb_core(skb, true);
+ ret = __netif_receive_skb_one_core(skb, true);
memalloc_noreclaim_restore(noreclaim_flag);
} else
- ret = __netif_receive_skb_core(skb, false);
+ ret = __netif_receive_skb_one_core(skb, false);
return ret;
}
+static void __netif_receive_skb_list(struct list_head *head)
+{
+ unsigned long noreclaim_flag = 0;
+ struct sk_buff *skb, *next;
+ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+ struct list_head sublist;
+
+ /* Handle the previous sublist */
+ list_cut_before(&sublist, head, &skb->list);
+ if (!list_empty(&sublist))
+ __netif_receive_skb_list_core(&sublist, pfmemalloc);
+ pfmemalloc = !pfmemalloc;
+ /* See comments in __netif_receive_skb */
+ if (pfmemalloc)
+ noreclaim_flag = memalloc_noreclaim_save();
+ else
+ memalloc_noreclaim_restore(noreclaim_flag);
+ }
+ }
+ /* Handle the remaining sublist */
+ if (!list_empty(head))
+ __netif_receive_skb_list_core(head, pfmemalloc);
+ /* Restore pflags */
+ if (pfmemalloc)
+ memalloc_noreclaim_restore(noreclaim_flag);
+}
+
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4717,7 +5042,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
break;
case XDP_QUERY_PROG:
- xdp->prog_attached = !!old;
xdp->prog_id = old ? old->aux->id : 0;
break;
@@ -4769,6 +5093,55 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
return ret;
}
+static void netif_receive_skb_list_internal(struct list_head *head)
+{
+ struct bpf_prog *xdp_prog = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+ list_del(&skb->list);
+ if (!skb_defer_rx_timestamp(skb))
+ list_add_tail(&skb->list, &sublist);
+ }
+ list_splice_init(&sublist, head);
+
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
+ preempt_disable();
+ rcu_read_lock();
+ list_for_each_entry_safe(skb, next, head, list) {
+ xdp_prog = rcu_dereference(skb->dev->xdp_prog);
+ list_del(&skb->list);
+ if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
+ list_add_tail(&skb->list, &sublist);
+ }
+ rcu_read_unlock();
+ preempt_enable();
+ /* Put passed packets back on main list */
+ list_splice_init(&sublist, head);
+ }
+
+ rcu_read_lock();
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ /* Will be handled, remove from list */
+ list_del(&skb->list);
+ enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ }
+ }
+ }
+#endif
+ __netif_receive_skb_list(head);
+ rcu_read_unlock();
+}
+
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
@@ -4792,6 +5165,28 @@ int netif_receive_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_receive_skb);
+/**
+ * netif_receive_skb_list - process many receive buffers from network
+ * @head: list of skbs to process.
+ *
+ * Since return value of netif_receive_skb() is normally ignored, and
+ * wouldn't be meaningful for a list, this function returns void.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+ struct sk_buff *skb;
+
+ if (list_empty(head))
+ return;
+ list_for_each_entry(skb, head, list)
+ trace_netif_receive_skb_list_entry(skb);
+ netif_receive_skb_list_internal(head);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
DEFINE_PER_CPU(struct work_struct, flush_works);
/* Network device is going away, flush any packets still pending */
@@ -4875,42 +5270,50 @@ out:
return netif_receive_skb_internal(skb);
}
-/* napi->gro_list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
+ bool flush_old)
{
- struct sk_buff *skb, *prev = NULL;
-
- /* scan list and build reverse chain */
- for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
- skb->prev = prev;
- prev = skb;
- }
-
- for (skb = prev; skb; skb = prev) {
- skb->next = NULL;
+ struct list_head *head = &napi->gro_hash[index].list;
+ struct sk_buff *skb, *p;
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
-
- prev = skb->prev;
+ list_del(&skb->list);
+ skb->next = NULL;
napi_gro_complete(skb);
- napi->gro_count--;
+ napi->gro_hash[index].count--;
}
- napi->gro_list = NULL;
+ if (!napi->gro_hash[index].count)
+ __clear_bit(index, &napi->gro_bitmask);
+}
+
+/* napi->gro_hash[].list contains packets ordered by age.
+ * Youngest packets are at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ u32 i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ if (test_bit(i, &napi->gro_bitmask))
+ __napi_gro_flush_chain(napi, i, flush_old);
+ }
}
EXPORT_SYMBOL(napi_gro_flush);
-static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+ struct sk_buff *skb)
{
- struct sk_buff *p;
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
+ struct list_head *head;
+ struct sk_buff *p;
- for (p = napi->gro_list; p; p = p->next) {
+ head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
+ list_for_each_entry(p, head, list) {
unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
@@ -4933,6 +5336,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
+
+ return head;
}
static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4975,20 +5380,41 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
}
}
+static void gro_flush_oldest(struct list_head *head)
+{
+ struct sk_buff *oldest;
+
+ oldest = list_last_entry(head, struct sk_buff, list);
+
+ /* We are called with head length >= MAX_GRO_SKBS, so this is
+ * impossible.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust napi->gro_hash[].count, caller is adding a new
+ * SKB to the chain.
+ */
+ list_del(&oldest->list);
+ napi_gro_complete(oldest);
+}
+
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *head = &offload_base;
- int same_flow;
+ struct list_head *gro_head;
+ struct sk_buff *pp = NULL;
enum gro_result ret;
+ int same_flow;
int grow;
if (netif_elide_gro(skb->dev))
goto normal;
- gro_list_prepare(napi, skb);
+ gro_head = gro_list_prepare(napi, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -5022,7 +5448,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->csum_valid = 0;
}
- pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+ pp = ptype->callbacks.gro_receive(gro_head, skb);
break;
}
rcu_read_unlock();
@@ -5039,12 +5465,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
- struct sk_buff *nskb = *pp;
-
- *pp = nskb->next;
- nskb->next = NULL;
- napi_gro_complete(nskb);
- napi->gro_count--;
+ list_del(&pp->list);
+ pp->next = NULL;
+ napi_gro_complete(pp);
+ napi->gro_hash[hash].count--;
}
if (same_flow)
@@ -5053,26 +5477,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (NAPI_GRO_CB(skb)->flush)
goto normal;
- if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
- struct sk_buff *nskb = napi->gro_list;
-
- /* locate the end of the list to select the 'oldest' flow */
- while (nskb->next) {
- pp = &nskb->next;
- nskb = *pp;
- }
- *pp = NULL;
- nskb->next = NULL;
- napi_gro_complete(nskb);
+ if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
+ gro_flush_oldest(gro_head);
} else {
- napi->gro_count++;
+ napi->gro_hash[hash].count++;
}
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- skb->next = napi->gro_list;
- napi->gro_list = skb;
+ list_add(&skb->list, gro_head);
ret = GRO_HELD;
pull:
@@ -5080,6 +5494,13 @@ pull:
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
+ if (napi->gro_hash[hash].count) {
+ if (!test_bit(hash, &napi->gro_bitmask))
+ __set_bit(hash, &napi->gro_bitmask);
+ } else if (test_bit(hash, &napi->gro_bitmask)) {
+ __clear_bit(hash, &napi->gro_bitmask);
+ }
+
return ret;
normal:
@@ -5478,7 +5899,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
- if (n->gro_list) {
+ if (n->gro_bitmask) {
unsigned long timeout = 0;
if (work_done)
@@ -5687,7 +6108,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_list && !napi_disable_pending(napi) &&
+ if (napi->gro_bitmask && !napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
@@ -5697,11 +6118,16 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ int i;
+
INIT_LIST_HEAD(&napi->poll_list);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
- napi->gro_count = 0;
- napi->gro_list = NULL;
+ napi->gro_bitmask = 0;
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ INIT_LIST_HEAD(&napi->gro_hash[i].list);
+ napi->gro_hash[i].count = 0;
+ }
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
@@ -5734,6 +6160,19 @@ void napi_disable(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_disable);
+static void flush_gro_hash(struct napi_struct *napi)
+{
+ int i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ struct sk_buff *skb, *n;
+
+ list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
+ kfree_skb(skb);
+ napi->gro_hash[i].count = 0;
+ }
+}
+
/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
@@ -5743,9 +6182,8 @@ void netif_napi_del(struct napi_struct *napi)
list_del_init(&napi->dev_list);
napi_free_frags(napi);
- kfree_skb_list(napi->gro_list);
- napi->gro_list = NULL;
- napi->gro_count = 0;
+ flush_gro_hash(napi);
+ napi->gro_bitmask = 0;
}
EXPORT_SYMBOL(netif_napi_del);
@@ -5787,7 +6225,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
- if (n->gro_list) {
+ if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
@@ -7276,23 +7714,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
EXPORT_SYMBOL(dev_change_proto_down);
-void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
- struct netdev_bpf *xdp)
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+ enum bpf_netdev_command cmd)
{
- memset(xdp, 0, sizeof(*xdp));
- xdp->command = XDP_QUERY_PROG;
+ struct netdev_bpf xdp;
- /* Query must always succeed. */
- WARN_ON(bpf_op(dev, xdp) < 0);
-}
+ if (!bpf_op)
+ return 0;
-static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
-{
- struct netdev_bpf xdp;
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = cmd;
- __dev_xdp_query(dev, bpf_op, &xdp);
+ /* Query must always succeed. */
+ WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
- return xdp.prog_attached;
+ return xdp.prog_id;
}
static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
@@ -7326,12 +7762,19 @@ static void dev_xdp_uninstall(struct net_device *dev)
if (!ndo_bpf)
return;
- __dev_xdp_query(dev, ndo_bpf, &xdp);
- if (xdp.prog_attached == XDP_ATTACHED_NONE)
- return;
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+ WARN_ON(ndo_bpf(dev, &xdp));
+ if (xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
- /* Program removal should always succeed */
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+ /* Remove HW offload */
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG_HW;
+ if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
}
/**
@@ -7347,12 +7790,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ enum bpf_netdev_command query;
struct bpf_prog *prog = NULL;
bpf_op_t bpf_op, bpf_chk;
int err;
ASSERT_RTNL();
+ query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+
bpf_op = bpf_chk = ops->ndo_bpf;
if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
return -EOPNOTSUPP;
@@ -7362,10 +7808,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
if (fd >= 0) {
- if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
+ if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
+ __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
return -EEXIST;
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_attached(dev, bpf_op))
+ __dev_xdp_query(dev, bpf_op, query))
return -EBUSY;
prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
@@ -8834,6 +9281,9 @@ static struct hlist_head * __net_init netdev_create_hash(void)
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
+ BUILD_BUG_ON(GRO_HASH_BUCKETS >
+ 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+
if (net != &init_net)
INIT_LIST_HEAD(&net->dev_base_head);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 22099705cc41..65fc366a78a4 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -326,6 +326,57 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
pool_type, p_tc_index);
}
+struct devlink_region {
+ struct devlink *devlink;
+ struct list_head list;
+ const char *name;
+ struct list_head snapshot_list;
+ u32 max_snapshots;
+ u32 cur_snapshots;
+ u64 size;
+};
+
+struct devlink_snapshot {
+ struct list_head list;
+ struct devlink_region *region;
+ devlink_snapshot_data_dest_t *data_destructor;
+ u64 data_len;
+ u8 *data;
+ u32 id;
+};
+
+static struct devlink_region *
+devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &devlink->region_list, list)
+ if (!strcmp(region->name, region_name))
+ return region;
+
+ return NULL;
+}
+
+static struct devlink_snapshot *
+devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
+{
+ struct devlink_snapshot *snapshot;
+
+ list_for_each_entry(snapshot, &region->snapshot_list, list)
+ if (snapshot->id == id)
+ return snapshot;
+
+ return NULL;
+}
+
+static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot)
+{
+ snapshot->region->cur_snapshots--;
+ list_del(&snapshot->list);
+ (*snapshot->data_destructor)(snapshot->data);
+ kfree(snapshot);
+}
+
#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
@@ -2604,6 +2655,919 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return devlink->ops->reload(devlink, info->extack);
}
+static const struct devlink_param devlink_param_generic[] = {
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
+ .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
+ .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+ .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
+ .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
+ },
+};
+
+static int devlink_param_generic_verify(const struct devlink_param *param)
+{
+ /* verify it matches a generic parameter by id and name */
+ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ if (strcmp(param->name, devlink_param_generic[param->id].name))
+ return -ENOENT;
+
+ WARN_ON(param->type != devlink_param_generic[param->id].type);
+
+ return 0;
+}
+
+static int devlink_param_driver_verify(const struct devlink_param *param)
+{
+ int i;
+
+ if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ /* verify no such name in generic params */
+ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
+ if (!strcmp(param->name, devlink_param_generic[i].name))
+ return -EEXIST;
+
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_name(struct list_head *param_list,
+ const char *param_name)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, param_list, list)
+ if (!strcmp(param_item->param->name, param_name))
+ return param_item;
+ return NULL;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_id(struct list_head *param_list, u32 param_id)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, param_list, list)
+ if (param_item->param->id == param_id)
+ return param_item;
+ return NULL;
+}
+
+static bool
+devlink_param_cmode_is_supported(const struct devlink_param *param,
+ enum devlink_param_cmode cmode)
+{
+ return test_bit(cmode, &param->supported_cmodes);
+}
+
+static int devlink_param_get(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx)
+{
+ if (!param->get)
+ return -EOPNOTSUPP;
+ return param->get(devlink, param->id, ctx);
+}
+
+static int devlink_param_set(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx)
+{
+ if (!param->set)
+ return -EOPNOTSUPP;
+ return param->set(devlink, param->id, ctx);
+}
+
+static int
+devlink_param_type_to_nla_type(enum devlink_param_type param_type)
+{
+ switch (param_type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ return NLA_U8;
+ case DEVLINK_PARAM_TYPE_U16:
+ return NLA_U16;
+ case DEVLINK_PARAM_TYPE_U32:
+ return NLA_U32;
+ case DEVLINK_PARAM_TYPE_STRING:
+ return NLA_STRING;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ return NLA_FLAG;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int
+devlink_nl_param_value_fill_one(struct sk_buff *msg,
+ enum devlink_param_type type,
+ enum devlink_param_cmode cmode,
+ union devlink_param_value val)
+{
+ struct nlattr *param_value_attr;
+
+ param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE);
+ if (!param_value_attr)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
+ goto value_nest_cancel;
+
+ switch (type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
+ val.vstr))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ if (val.vbool &&
+ nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA))
+ goto value_nest_cancel;
+ break;
+ }
+
+ nla_nest_end(msg, param_value_attr);
+ return 0;
+
+value_nest_cancel:
+ nla_nest_cancel(msg, param_value_attr);
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
+ const struct devlink_param *param = param_item->param;
+ struct devlink_param_gset_ctx ctx;
+ struct nlattr *param_values_list;
+ struct nlattr *param_attr;
+ int nla_type;
+ void *hdr;
+ int err;
+ int i;
+
+ /* Get each supported cmode's value: cached for driverinit, else from the driver */
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (!param_item->driverinit_value_valid)
+ return -EOPNOTSUPP;
+ param_value[i] = param_item->driverinit_value;
+ } else {
+ ctx.cmode = i;
+ err = devlink_param_get(devlink, param, &ctx);
+ if (err)
+ return err;
+ param_value[i] = ctx.val;
+ }
+ }
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+ param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);
+ if (!param_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
+ goto param_nest_cancel;
+ if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
+ goto param_nest_cancel;
+
+ nla_type = devlink_param_type_to_nla_type(param->type);
+ if (nla_type < 0)
+ goto param_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
+ goto param_nest_cancel;
+
+ param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST);
+ if (!param_values_list)
+ goto param_nest_cancel;
+
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ err = devlink_nl_param_value_fill_one(msg, param->type,
+ i, param_value[i]);
+ if (err)
+ goto values_list_nest_cancel;
+ }
+
+ nla_nest_end(msg, param_values_list);
+ nla_nest_end(msg, param_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+values_list_nest_cancel:
+ nla_nest_cancel(msg, param_values_list);
+param_nest_cancel:
+ nla_nest_cancel(msg, param_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_param_notify(struct devlink *devlink,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+ err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_param_item *param_item;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(param_item, &devlink->param_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_param_fill(msg, devlink, param_item,
+ DEVLINK_CMD_PARAM_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int
+devlink_param_type_get_from_info(struct genl_info *info,
+ enum devlink_param_type *param_type)
+{
+ if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE])
+ return -EINVAL;
+
+ switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
+ case NLA_U8:
+ *param_type = DEVLINK_PARAM_TYPE_U8;
+ break;
+ case NLA_U16:
+ *param_type = DEVLINK_PARAM_TYPE_U16;
+ break;
+ case NLA_U32:
+ *param_type = DEVLINK_PARAM_TYPE_U32;
+ break;
+ case NLA_STRING:
+ *param_type = DEVLINK_PARAM_TYPE_STRING;
+ break;
+ case NLA_FLAG:
+ *param_type = DEVLINK_PARAM_TYPE_BOOL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+devlink_param_value_get_from_info(const struct devlink_param *param,
+ struct genl_info *info,
+ union devlink_param_value *value)
+{
+ if (param->type != DEVLINK_PARAM_TYPE_BOOL &&
+ !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])
+ return -EINVAL;
+
+ switch (param->type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) >
+ DEVLINK_PARAM_MAX_STRING_VALUE)
+ return -EINVAL;
+ value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ?
+ true : false;
+ break;
+ }
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ char *param_name;
+
+ if (!info->attrs[DEVLINK_ATTR_PARAM_NAME])
+ return NULL;
+
+ param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
+ return devlink_param_find_by_name(&devlink->param_list, param_name);
+}
+
+/* DEVLINK_CMD_PARAM_GET doit handler: reply with a single message that
+ * describes the parameter named in the request.
+ *
+ * Returns -EINVAL when the parameter cannot be resolved, -ENOMEM when the
+ * reply cannot be allocated, or the result of filling/sending the reply.
+ */
+static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
+					 struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_param_item *item;
+	struct sk_buff *reply;
+	int rc;
+
+	item = devlink_param_get_from_info(devlink, info);
+	if (!item)
+		return -EINVAL;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	rc = devlink_nl_param_fill(reply, devlink, item,
+				   DEVLINK_CMD_PARAM_GET,
+				   info->snd_portid, info->snd_seq, 0);
+	if (!rc)
+		return genlmsg_reply(reply, info);
+
+	/* The reply was never handed to the socket, so free it here. */
+	nlmsg_free(reply);
+	return rc;
+}
+
+/* DEVLINK_CMD_PARAM_SET doit handler.
+ *
+ * Resolves the parameter, checks that the requested type matches the
+ * registered one, extracts the value, runs the optional validate()
+ * callback and then applies the value in the requested configuration mode:
+ * driverinit values are cached on the item, all other modes go through
+ * devlink_param_set().  A PARAM_NEW notification is sent on success.
+ */
+static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
+					 struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_param_gset_ctx set_ctx;
+	enum devlink_param_type req_type;
+	struct devlink_param_item *item;
+	const struct devlink_param *param;
+	union devlink_param_value new_val;
+	enum devlink_param_cmode cmode;
+	struct nlattr *cmode_attr;
+	int rc;
+
+	item = devlink_param_get_from_info(devlink, info);
+	if (!item)
+		return -EINVAL;
+	param = item->param;
+
+	rc = devlink_param_type_get_from_info(info, &req_type);
+	if (rc)
+		return rc;
+	if (req_type != param->type)
+		return -EINVAL;
+
+	rc = devlink_param_value_get_from_info(param, info, &new_val);
+	if (rc)
+		return rc;
+
+	if (param->validate) {
+		rc = param->validate(devlink, param->id, new_val,
+				     info->extack);
+		if (rc)
+			return rc;
+	}
+
+	cmode_attr = info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE];
+	if (!cmode_attr)
+		return -EINVAL;
+	cmode = nla_get_u8(cmode_attr);
+	if (!devlink_param_cmode_is_supported(param, cmode))
+		return -EOPNOTSUPP;
+
+	if (cmode != DEVLINK_PARAM_CMODE_DRIVERINIT) {
+		if (!param->set)
+			return -EOPNOTSUPP;
+		set_ctx.val = new_val;
+		set_ctx.cmode = cmode;
+		rc = devlink_param_set(devlink, param, &set_ctx);
+		if (rc)
+			return rc;
+	} else {
+		/* driverinit values are only cached on the item here */
+		item->driverinit_value = new_val;
+		item->driverinit_value_valid = true;
+	}
+
+	devlink_param_notify(devlink, item, DEVLINK_CMD_PARAM_NEW);
+	return 0;
+}
+
+/* Add one parameter to devlink->param_list and announce it with
+ * DEVLINK_CMD_PARAM_NEW.
+ *
+ * Returns -EEXIST when a parameter of the same name is already listed and
+ * -ENOMEM when the list item cannot be allocated.
+ */
+static int devlink_param_register_one(struct devlink *devlink,
+				      const struct devlink_param *param)
+{
+	bool driverinit_only = param->supported_cmodes ==
+			       BIT(DEVLINK_PARAM_CMODE_DRIVERINIT);
+	struct devlink_param_item *item;
+
+	if (devlink_param_find_by_name(&devlink->param_list, param->name))
+		return -EEXIST;
+
+	/* A driverinit-only parameter is expected to have neither callback;
+	 * any other parameter is expected to have both.
+	 */
+	if (driverinit_only)
+		WARN_ON(param->get || param->set);
+	else
+		WARN_ON(!param->get || !param->set);
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->param = param;
+	list_add_tail(&item->list, &devlink->param_list);
+	devlink_param_notify(devlink, item, DEVLINK_CMD_PARAM_NEW);
+	return 0;
+}
+
+/* Remove one parameter from devlink->param_list, announcing the removal
+ * with DEVLINK_CMD_PARAM_DEL before the item is unlinked and freed.
+ */
+static void devlink_param_unregister_one(struct devlink *devlink,
+					 const struct devlink_param *param)
+{
+	struct devlink_param_item *param_item;
+
+	param_item = devlink_param_find_by_name(&devlink->param_list,
+						param->name);
+	/* An unknown parameter is a caller bug: warn, but do not go on to
+	 * dereference the NULL item in list_del() below.
+	 */
+	if (WARN_ON(!param_item))
+		return;
+
+	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL);
+	list_del(&param_item->list);
+	kfree(param_item);
+}
+
+/* Emit one DEVLINK_ATTR_REGION_SNAPSHOT nest holding the snapshot's id.
+ * On failure the partially written nest is cancelled.
+ */
+static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
+					     struct devlink *devlink,
+					     struct devlink_snapshot *snapshot)
+{
+	struct nlattr *nest;
+	int rc;
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
+	if (!nest)
+		return -EINVAL;
+
+	rc = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
+	if (rc) {
+		nla_nest_cancel(msg, nest);
+		return rc;
+	}
+
+	nla_nest_end(msg, nest);
+	return 0;
+}
+
+/* Emit a DEVLINK_ATTR_REGION_SNAPSHOTS nest containing one snapshot entry
+ * per element of region->snapshot_list.  The whole nest is cancelled if
+ * any entry cannot be added.
+ */
+static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
+					      struct devlink *devlink,
+					      struct devlink_region *region)
+{
+	struct devlink_snapshot *snap;
+	struct nlattr *nest;
+	int rc;
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS);
+	if (!nest)
+		return -EINVAL;
+
+	list_for_each_entry(snap, &region->snapshot_list, list) {
+		rc = devlink_nl_region_snapshot_id_put(msg, devlink, snap);
+		if (rc) {
+			nla_nest_cancel(msg, nest);
+			return rc;
+		}
+	}
+
+	nla_nest_end(msg, nest);
+	return 0;
+}
+
+/* Build one region message: devlink handle, region name, region size and
+ * the list of snapshot ids.  The message is cancelled if any attribute
+ * cannot be added.
+ *
+ * Returns -EMSGSIZE when the generic netlink header does not fit, or the
+ * error of the first attribute that fails.
+ */
+static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
+				  enum devlink_command cmd, u32 portid,
+				  u32 seq, int flags,
+				  struct devlink_region *region)
+{
+	void *genl_hdr;
+	int rc;
+
+	genl_hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
+			       cmd);
+	if (!genl_hdr)
+		return -EMSGSIZE;
+
+	/* Each step runs only while every earlier one succeeded. */
+	rc = devlink_nl_put_handle(msg, devlink);
+	if (!rc)
+		rc = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
+				    region->name);
+	if (!rc)
+		rc = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
+				       region->size, DEVLINK_ATTR_PAD);
+	if (!rc)
+		rc = devlink_nl_region_snapshots_id_put(msg, devlink, region);
+
+	if (rc) {
+		genlmsg_cancel(msg, genl_hdr);
+		return rc;
+	}
+
+	genlmsg_end(msg, genl_hdr);
+	return 0;
+}
+
+/* Multicast a region event on DEVLINK_MCGRP_CONFIG.
+ *
+ * The message always carries the devlink handle and the region name.  With
+ * a @snapshot it additionally carries that snapshot's id; without one it
+ * carries the region size instead.  Any failure while building the message
+ * drops the notification.
+ */
+static void devlink_nl_region_notify(struct devlink_region *region,
+				     struct devlink_snapshot *snapshot,
+				     enum devlink_command cmd)
+{
+	struct devlink *devlink = region->devlink;
+	struct sk_buff *nmsg;
+	void *genl_hdr;
+	int rc;
+
+	WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+
+	nmsg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!nmsg)
+		return;
+
+	genl_hdr = genlmsg_put(nmsg, 0, 0, &devlink_nl_family, 0, cmd);
+	if (!genl_hdr)
+		goto free_msg;
+
+	rc = devlink_nl_put_handle(nmsg, devlink);
+	if (rc)
+		goto cancel_msg;
+
+	rc = nla_put_string(nmsg, DEVLINK_ATTR_REGION_NAME, region->name);
+	if (rc)
+		goto cancel_msg;
+
+	if (snapshot)
+		rc = nla_put_u32(nmsg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+				 snapshot->id);
+	else
+		rc = nla_put_u64_64bit(nmsg, DEVLINK_ATTR_REGION_SIZE,
+				       region->size, DEVLINK_ATTR_PAD);
+	if (rc)
+		goto cancel_msg;
+
+	genlmsg_end(nmsg, genl_hdr);
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				nmsg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+	return;
+
+cancel_msg:
+	genlmsg_cancel(nmsg, genl_hdr);
+free_msg:
+	nlmsg_free(nmsg);
+}
+
+/* DEVLINK_CMD_REGION_GET doit handler: reply with a description of the
+ * region named by DEVLINK_ATTR_REGION_NAME.
+ *
+ * Returns -EINVAL when the attribute is missing or the region is unknown,
+ * -ENOMEM when the reply cannot be allocated.
+ */
+static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_REGION_NAME];
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_region *region;
+	struct sk_buff *reply;
+	int rc;
+
+	if (!name_attr)
+		return -EINVAL;
+
+	region = devlink_region_get_by_name(devlink, nla_data(name_attr));
+	if (!region)
+		return -EINVAL;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	rc = devlink_nl_region_fill(reply, devlink, DEVLINK_CMD_REGION_GET,
+				    info->snd_portid, info->snd_seq, 0,
+				    region);
+	if (!rc)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return rc;
+}
+
+/* DEVLINK_CMD_REGION_GET dumpit handler.
+ *
+ * Walks every devlink instance in the caller's network namespace and emits
+ * one message per region.  cb->args[0] holds the number of regions already
+ * emitted by earlier invocations, so the walk resumes after them.  The walk
+ * stops at the first region that cannot be filled in; that region is not
+ * counted and is therefore retried on the next invocation.
+ */
+static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
+					    struct netlink_callback *cb)
+{
+	int resume_at = cb->args[0];
+	struct devlink_region *region;
+	struct devlink *devlink;
+	int seen = 0;
+	int rc;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+
+		mutex_lock(&devlink->lock);
+		list_for_each_entry(region, &devlink->region_list, list) {
+			if (seen >= resume_at) {
+				rc = devlink_nl_region_fill(msg, devlink,
+							    DEVLINK_CMD_REGION_GET,
+							    NETLINK_CB(cb->skb).portid,
+							    cb->nlh->nlmsg_seq,
+							    NLM_F_MULTI, region);
+				if (rc) {
+					mutex_unlock(&devlink->lock);
+					goto done;
+				}
+			}
+			seen++;
+		}
+		mutex_unlock(&devlink->lock);
+	}
+done:
+	mutex_unlock(&devlink_mutex);
+	cb->args[0] = seen;
+	return msg->len;
+}
+
+/* DEVLINK_CMD_REGION_DEL doit handler: delete one snapshot of a region.
+ *
+ * Both DEVLINK_ATTR_REGION_NAME and DEVLINK_ATTR_REGION_SNAPSHOT_ID are
+ * required.  A DEVLINK_CMD_REGION_DEL notification is sent before the
+ * snapshot is deleted.  Returns -EINVAL when an attribute is missing or the
+ * region/snapshot cannot be found.
+ */
+static int devlink_nl_cmd_region_del(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_REGION_NAME];
+	struct nlattr *id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_snapshot *snap;
+	struct devlink_region *region;
+
+	if (!name_attr || !id_attr)
+		return -EINVAL;
+
+	region = devlink_region_get_by_name(devlink, nla_data(name_attr));
+	if (!region)
+		return -EINVAL;
+
+	snap = devlink_region_snapshot_get_by_id(region, nla_get_u32(id_attr));
+	if (!snap)
+		return -EINVAL;
+
+	devlink_nl_region_notify(region, snap, DEVLINK_CMD_REGION_DEL);
+	devlink_region_snapshot_del(snap);
+	return 0;
+}
+
+/* Emit one DEVLINK_ATTR_REGION_CHUNK nest carrying @chunk_size bytes from
+ * @chunk together with the address @addr they were read from.  The nest is
+ * cancelled if either attribute does not fit.
+ */
+static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
+						 struct devlink *devlink,
+						 u8 *chunk, u32 chunk_size,
+						 u64 addr)
+{
+	struct nlattr *nest;
+	int rc;
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK);
+	if (!nest)
+		return -EINVAL;
+
+	rc = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
+	if (!rc)
+		rc = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR,
+				       addr, DEVLINK_ATTR_PAD);
+	if (rc) {
+		nla_nest_cancel(msg, nest);
+		return rc;
+	}
+
+	nla_nest_end(msg, nest);
+	return 0;
+}
+
+/* Upper bound on the payload of a single chunk attribute */
+#define DEVLINK_REGION_READ_CHUNK_SIZE 256
+
+/* Copy snapshot data in [start_offset, end_offset) into @skb as a sequence
+ * of chunks of at most DEVLINK_REGION_READ_CHUNK_SIZE bytes.
+ *
+ * @end_offset is clamped to the snapshot length; when @dump is set the
+ * whole remainder of the snapshot is requested.  *@new_offset receives the
+ * offset following the last chunk that was written, which equals
+ * @start_offset when nothing was written.
+ *
+ * Returns 0 when the whole range was written, -EINVAL when the snapshot id
+ * in @attrs is unknown, or the error of the chunk that did not fit.
+ */
+static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
+						struct devlink *devlink,
+						struct devlink_region *region,
+						struct nlattr **attrs,
+						u64 start_offset,
+						u64 end_offset,
+						bool dump,
+						u64 *new_offset)
+{
+	struct devlink_snapshot *snap;
+	u64 offset = start_offset;
+	int rc = 0;
+	u32 id;
+
+	*new_offset = start_offset;
+
+	id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+	snap = devlink_region_snapshot_get_by_id(region, id);
+	if (!snap)
+		return -EINVAL;
+
+	if (dump || end_offset > snap->data_len)
+		end_offset = snap->data_len;
+
+	while (offset < end_offset) {
+		u64 remaining = end_offset - offset;
+		u32 chunk_len;
+
+		chunk_len = remaining < DEVLINK_REGION_READ_CHUNK_SIZE ?
+			    remaining : DEVLINK_REGION_READ_CHUNK_SIZE;
+
+		rc = devlink_nl_cmd_region_read_chunk_fill(skb, devlink,
+							   &snap->data[offset],
+							   chunk_len, offset);
+		if (rc)
+			break;
+
+		offset += chunk_len;
+	}
+	*new_offset = offset;
+
+	return rc;
+}
+
+/* DEVLINK_CMD_REGION_READ dumpit handler.
+ *
+ * Streams the contents of one snapshot of one region.  The offset reached
+ * so far is kept as a u64 in cb->args[0], so each invocation continues
+ * where the previous one stopped.  When both DEVLINK_ATTR_REGION_CHUNK_ADDR
+ * and DEVLINK_ATTR_REGION_CHUNK_LEN are given only that window is read;
+ * otherwise the whole snapshot is dumped.
+ *
+ * devlink_mutex is taken before the devlink instance is looked up, matching
+ * the other walkers of devlink_list in this file, so the instance cannot
+ * be looked up while the list is being modified.
+ *
+ * Returns skb->len while data was added, 0 once no further progress can be
+ * made (which ends the dump), or a negative errno when the request cannot
+ * be served at all.
+ */
+static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
+					     struct netlink_callback *cb)
+{
+	u64 ret_offset, start_offset, end_offset = 0;
+	struct nlattr *attrs[DEVLINK_ATTR_MAX + 1];
+	const struct genl_ops *ops = cb->data;
+	struct devlink_region *region;
+	struct nlattr *chunks_attr;
+	const char *region_name;
+	struct devlink *devlink;
+	bool dump = true;
+	void *hdr;
+	int err;
+
+	start_offset = *((u64 *)&cb->args[0]);
+
+	err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,
+			  attrs, DEVLINK_ATTR_MAX, ops->policy, NULL);
+	if (err)
+		goto out;
+
+	mutex_lock(&devlink_mutex);
+	devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+	if (IS_ERR(devlink)) {
+		err = PTR_ERR(devlink);
+		goto out_dev;
+	}
+
+	mutex_lock(&devlink->lock);
+
+	if (!attrs[DEVLINK_ATTR_REGION_NAME] ||
+	    !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
+	region = devlink_region_get_by_name(devlink, region_name);
+	if (!region) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
+			  DEVLINK_CMD_REGION_READ);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto out_unlock;
+	}
+
+	err = devlink_nl_put_handle(skb, devlink);
+	if (err)
+		goto nla_put_failure;
+
+	err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
+	if (err)
+		goto nla_put_failure;
+
+	chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS);
+	if (!chunks_attr) {
+		err = -EMSGSIZE;
+		goto nla_put_failure;
+	}
+
+	if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+	    attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+		/* First invocation of a windowed read: start at the window */
+		if (!start_offset)
+			start_offset =
+				nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+		end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+		end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+		dump = false;
+	}
+
+	err = devlink_nl_region_read_snapshot_fill(skb, devlink,
+						   region, attrs,
+						   start_offset,
+						   end_offset, dump,
+						   &ret_offset);
+
+	/* -EMSGSIZE only means this skb is full; the dump continues */
+	if (err && err != -EMSGSIZE)
+		goto nla_put_failure;
+
+	/* Check if there was any progress done to prevent infinite loop.
+	 * No progress also covers the normal end of data, so it ends the
+	 * dump with 0 rather than with an error.
+	 */
+	if (ret_offset == start_offset) {
+		err = 0;
+		goto nla_put_failure;
+	}
+
+	*((u64 *)&cb->args[0]) = ret_offset;
+
+	nla_nest_end(skb, chunks_attr);
+	genlmsg_end(skb, hdr);
+	mutex_unlock(&devlink->lock);
+	mutex_unlock(&devlink_mutex);
+
+	return skb->len;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+out_unlock:
+	mutex_unlock(&devlink->lock);
+out_dev:
+	mutex_unlock(&devlink_mutex);
+out:
+	return err;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2624,6 +3588,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
+ [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -2807,6 +3776,43 @@ static const struct genl_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
DEVLINK_NL_FLAG_NO_LOCK,
},
+ {
+ .cmd = DEVLINK_CMD_PARAM_GET,
+ .doit = devlink_nl_cmd_param_get_doit,
+ .dumpit = devlink_nl_cmd_param_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_SET,
+ .doit = devlink_nl_cmd_param_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_GET,
+ .doit = devlink_nl_cmd_region_get_doit,
+ .dumpit = devlink_nl_cmd_region_get_dumpit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_DEL,
+ .doit = devlink_nl_cmd_region_del,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_READ,
+ .dumpit = devlink_nl_cmd_region_read_dumpit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2845,6 +3851,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->sb_list);
INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
INIT_LIST_HEAD(&devlink->resource_list);
+ INIT_LIST_HEAD(&devlink->param_list);
+ INIT_LIST_HEAD(&devlink->region_list);
mutex_init(&devlink->lock);
return devlink;
}
@@ -3434,6 +4442,320 @@ out:
}
EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
+/**
+ *	devlink_params_register - register configuration parameters
+ *
+ *	@devlink: devlink
+ *	@params: configuration parameters array
+ *	@params_count: number of parameters provided
+ *
+ *	Register the configuration parameters supported by the driver.
+ *	Registration is all-or-nothing: when one entry is rejected, the
+ *	entries registered before it are unregistered again and the error
+ *	is returned.
+ */
+int devlink_params_register(struct devlink *devlink,
+			    const struct devlink_param *params,
+			    size_t params_count)
+{
+	const struct devlink_param *param;
+	int err = 0;
+	int i;
+
+	mutex_lock(&devlink->lock);
+	for (i = 0; i < params_count; i++) {
+		param = &params[i];
+
+		if (!param || !param->name || !param->supported_cmodes) {
+			err = -EINVAL;
+			break;
+		}
+
+		err = param->generic ? devlink_param_generic_verify(param) :
+				       devlink_param_driver_verify(param);
+		if (err)
+			break;
+
+		err = devlink_param_register_one(devlink, param);
+		if (err)
+			break;
+	}
+
+	if (err) {
+		/* Undo entries [0, i) in reverse registration order */
+		while (i-- > 0)
+			devlink_param_unregister_one(devlink, &params[i]);
+	}
+
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_params_register);
+
+/**
+ *	devlink_params_unregister - unregister configuration parameters
+ *	@devlink: devlink
+ *	@params: configuration parameters to unregister
+ *	@params_count: number of parameters provided
+ *
+ *	The entries are unregistered in array order while holding the
+ *	devlink instance lock.
+ */
+void devlink_params_unregister(struct devlink *devlink,
+			       const struct devlink_param *params,
+			       size_t params_count)
+{
+	int idx;
+
+	mutex_lock(&devlink->lock);
+	for (idx = 0; idx < params_count; idx++)
+		devlink_param_unregister_one(devlink, &params[idx]);
+	mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_params_unregister);
+
+/**
+ *	devlink_param_driverinit_value_get - get configuration parameter
+ *					     value for driver initializing
+ *
+ *	@devlink: devlink
+ *	@param_id: parameter ID
+ *	@init_val: value of parameter in driverinit configuration mode
+ *
+ *	This function should be used by the driver to get driverinit
+ *	configuration for initialization after reload command.
+ *
+ *	Returns -EOPNOTSUPP when the instance has no reload operation, when
+ *	no driverinit value has been stored yet or when the parameter does
+ *	not support driverinit mode, and -EINVAL for an unknown @param_id.
+ */
+int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
+				       union devlink_param_value *init_val)
+{
+	struct devlink_param_item *item;
+
+	if (!devlink->ops || !devlink->ops->reload)
+		return -EOPNOTSUPP;
+
+	item = devlink_param_find_by_id(&devlink->param_list, param_id);
+	if (!item)
+		return -EINVAL;
+
+	if (!item->driverinit_value_valid)
+		return -EOPNOTSUPP;
+	if (!devlink_param_cmode_is_supported(item->param,
+					      DEVLINK_PARAM_CMODE_DRIVERINIT))
+		return -EOPNOTSUPP;
+
+	*init_val = item->driverinit_value;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
+
+/**
+ *	devlink_param_driverinit_value_set - set value of configuration
+ *					     parameter for driverinit
+ *					     configuration mode
+ *
+ *	@devlink: devlink
+ *	@param_id: parameter ID
+ *	@init_val: value of parameter to set for driverinit configuration mode
+ *
+ *	This function should be used by the driver to set driverinit
+ *	configuration mode default value.  The stored value is marked valid
+ *	and a DEVLINK_CMD_PARAM_NEW notification is sent.
+ *
+ *	Returns -EINVAL for an unknown @param_id and -EOPNOTSUPP when the
+ *	parameter does not support driverinit mode.
+ */
+int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
+				       union devlink_param_value init_val)
+{
+	struct devlink_param_item *item;
+
+	item = devlink_param_find_by_id(&devlink->param_list, param_id);
+	if (!item)
+		return -EINVAL;
+
+	if (!devlink_param_cmode_is_supported(item->param,
+					      DEVLINK_PARAM_CMODE_DRIVERINIT))
+		return -EOPNOTSUPP;
+
+	item->driverinit_value_valid = true;
+	item->driverinit_value = init_val;
+
+	devlink_param_notify(devlink, item, DEVLINK_CMD_PARAM_NEW);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
+
+/**
+ *	devlink_param_value_changed - notify devlink on a parameter's value
+ *				      change. Should be called by the driver
+ *				      right after the change.
+ *
+ *	@devlink: devlink
+ *	@param_id: parameter ID
+ *
+ *	This function should be used by the driver to notify devlink on value
+ *	change, excluding driverinit configuration mode.
+ *	For driverinit configuration mode driver should use the function
+ *	devlink_param_driverinit_value_set() instead.
+ *
+ *	An unknown @param_id triggers a warning and no notification is sent.
+ */
+void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
+{
+	struct devlink_param_item *param_item;
+
+	param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+	/* Warn about the driver bug, but never hand a NULL item to the
+	 * notification path.
+	 */
+	if (WARN_ON(!param_item))
+		return;
+
+	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_param_value_changed);
+
+/**
+ *	devlink_region_create - create a new address region
+ *
+ *	@devlink: devlink
+ *	@region_name: region name
+ *	@region_max_snapshots: Maximum supported number of snapshots for region
+ *	@region_size: size of region
+ *
+ *	The @region_name pointer itself is stored in the region, not a copy
+ *	of the string.  Returns the new region, or ERR_PTR(-EEXIST) when a
+ *	region of that name already exists, or ERR_PTR(-ENOMEM).
+ */
+struct devlink_region *devlink_region_create(struct devlink *devlink,
+					     const char *region_name,
+					     u32 region_max_snapshots,
+					     u64 region_size)
+{
+	struct devlink_region *region;
+
+	mutex_lock(&devlink->lock);
+
+	if (devlink_region_get_by_name(devlink, region_name)) {
+		region = ERR_PTR(-EEXIST);
+		goto unlock;
+	}
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region) {
+		region = ERR_PTR(-ENOMEM);
+		goto unlock;
+	}
+
+	region->name = region_name;
+	region->size = region_size;
+	region->max_snapshots = region_max_snapshots;
+	region->devlink = devlink;
+	INIT_LIST_HEAD(&region->snapshot_list);
+
+	list_add_tail(&region->list, &devlink->region_list);
+	devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+unlock:
+	mutex_unlock(&devlink->lock);
+	return region;
+}
+EXPORT_SYMBOL_GPL(devlink_region_create);
+
+/**
+ *	devlink_region_destroy - destroy address region
+ *
+ *	@region: devlink region to destroy
+ *
+ *	All snapshots of the region are deleted, the region is unlinked from
+ *	its devlink instance and a DEVLINK_CMD_REGION_DEL notification is
+ *	sent before the region is freed.
+ */
+void devlink_region_destroy(struct devlink_region *region)
+{
+	struct devlink *devlink = region->devlink;
+	struct devlink_snapshot *snap, *next;
+
+	mutex_lock(&devlink->lock);
+
+	/* _safe iteration: each snapshot is deleted while walking */
+	list_for_each_entry_safe(snap, next, &region->snapshot_list, list)
+		devlink_region_snapshot_del(snap);
+
+	list_del(&region->list);
+	devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
+
+	mutex_unlock(&devlink->lock);
+	kfree(region);
+}
+EXPORT_SYMBOL_GPL(devlink_region_destroy);
+
+/**
+ *	devlink_region_shapshot_id_get - get snapshot ID
+ *
+ *	This function should be called when adding a new snapshot.
+ *	Driver should use the same id for multiple snapshots taken
+ *	on multiple regions at the same time/by the same trigger.
+ *
+ *	The id is produced by incrementing a per-devlink counter under the
+ *	devlink instance lock.
+ *
+ *	@devlink: devlink
+ */
+u32 devlink_region_shapshot_id_get(struct devlink *devlink)
+{
+	u32 next_id;
+
+	mutex_lock(&devlink->lock);
+	devlink->snapshot_id++;
+	next_id = devlink->snapshot_id;
+	mutex_unlock(&devlink->lock);
+
+	return next_id;
+}
+EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);
+
+/**
+ *	devlink_region_snapshot_create - create a new snapshot
+ *	This will add a new snapshot of a region. The snapshot
+ *	will be stored on the region struct and can be accessed
+ *	from devlink. This is useful for future analyses of snapshots.
+ *	Multiple snapshots can be created on a region.
+ *	The @snapshot_id should be obtained using the getter function.
+ *
+ *	@region: devlink region of the snapshot
+ *	@data_len: size of snapshot data
+ *	@data: snapshot data
+ *	@snapshot_id: snapshot id to be created
+ *	@data_destructor: pointer to destructor function to free data
+ *
+ *	Returns -ENOMEM when the region already holds its maximum number of
+ *	snapshots or the snapshot cannot be allocated, and -EEXIST when the
+ *	region already has a snapshot with @snapshot_id.
+ */
+int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
+				   u8 *data, u32 snapshot_id,
+				   devlink_snapshot_data_dest_t *data_destructor)
+{
+	struct devlink *devlink = region->devlink;
+	struct devlink_snapshot *snap;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+
+	/* check if region can hold one more snapshot */
+	if (region->cur_snapshots == region->max_snapshots) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+		err = -EEXIST;
+		goto unlock;
+	}
+
+	snap = kzalloc(sizeof(*snap), GFP_KERNEL);
+	if (!snap) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	snap->region = region;
+	snap->id = snapshot_id;
+	snap->data_len = data_len;
+	snap->data = data;
+	snap->data_destructor = data_destructor;
+
+	list_add_tail(&snap->list, &region->snapshot_list);
+	region->cur_snapshots++;
+
+	devlink_nl_region_notify(region, snap, DEVLINK_CMD_REGION_NEW);
+
+unlock:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
+
static int __init devlink_module_init(void)
{
return genl_register_family(&devlink_nl_family);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e677a20180cf..c9993c6c2fd4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
[NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
[NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
+ [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
};
static const char
diff --git a/net/core/filter.c b/net/core/filter.c
index 06da770f543f..104d560946da 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3679,7 +3679,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
if (unlikely(size > IP_TUNNEL_OPTS_MAX))
return -ENOMEM;
- ip_tunnel_info_opts_set(info, from, size);
+ ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
return 0;
}
@@ -4751,6 +4751,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
+ /* else: fall through */
default:
return NULL;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 53f96e4f7bf5..08a5184f4b34 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -152,7 +152,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
!dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
!dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_ENC_PORTS))
+ FLOW_DISSECTOR_KEY_ENC_PORTS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP))
return;
info = skb_tunnel_info(skb);
@@ -212,6 +214,16 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
tp->src = key->tp_src;
tp->dst = key->tp_dst;
}
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
+ struct flow_dissector_key_ip *ip;
+
+ ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP,
+ target_container);
+ ip->tos = key->tos;
+ ip->ttl = key->ttl;
+ }
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
@@ -589,7 +601,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
enum flow_dissect_ret fdret;
- bool skip_vlan = false;
+ enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
int num_hdrs = 0;
u8 ip_proto = 0;
bool ret;
@@ -748,14 +760,14 @@ proto_again:
}
case htons(ETH_P_8021AD):
case htons(ETH_P_8021Q): {
- const struct vlan_hdr *vlan;
+ const struct vlan_hdr *vlan = NULL;
struct vlan_hdr _vlan;
- bool vlan_tag_present = skb && skb_vlan_tag_present(skb);
+ __be16 saved_vlan_tpid = proto;
- if (vlan_tag_present)
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
+ skb && skb_vlan_tag_present(skb)) {
proto = skb->protocol;
-
- if (!vlan_tag_present || eth_type_vlan(skb->protocol)) {
+ } else {
vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
data, hlen, &_vlan);
if (!vlan) {
@@ -765,20 +777,23 @@ proto_again:
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
- if (skip_vlan) {
- fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
- break;
- }
}
- skip_vlan = true;
- if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_VLAN)) {
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
+ } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
+ } else {
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector, dissector_vlan)) {
key_vlan = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_VLAN,
+ dissector_vlan,
target_container);
- if (vlan_tag_present) {
+ if (!vlan) {
key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
key_vlan->vlan_priority =
(skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
@@ -789,6 +804,7 @@ proto_again:
(ntohs(vlan->h_vlan_TCI) &
VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
}
+ key_vlan->vlan_tpid = saved_vlan_tpid;
}
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8e3fda9e725c..cbe85d8d4cc2 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
neigh->nud_state = new;
err = 0;
notify = old & NUD_VALID;
- if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
+ if (((old & (NUD_INCOMPLETE | NUD_PROBE)) ||
+ (flags & NEIGH_UPDATE_F_ADMIN)) &&
(new & NUD_FAILED)) {
neigh_invalidate(neigh);
notify = 1;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bb7e80f4ced3..ffa1d18f2c2c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1047,13 +1047,30 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
char *buf)
{
struct net_device *dev = queue->dev;
- int index = get_netdev_queue_index(queue);
- int tc = netdev_txq_to_tc(dev, index);
+ int index;
+ int tc;
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ index = get_netdev_queue_index(queue);
+
+ /* If queue belongs to subordinate dev use its TC mapping */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
- return sprintf(buf, "%u\n", tc);
+ /* We can report the traffic class one of two ways:
+ * Subordinate device traffic classes are reported with the traffic
+ * class first, and then the subordinate class so for example TC0 on
+ * subordinate device 2 will be reported as "0-2". If the queue
+ * belongs to the root device it will be reported with just the
+ * traffic class, so just "0" for TC 0 for example.
+ */
+ return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
+ sprintf(buf, "%u\n", tc);
}
#ifdef CONFIG_XPS
@@ -1214,10 +1231,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
cpumask_var_t mask;
unsigned long index;
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
index = get_netdev_queue_index(queue);
if (dev->num_tc) {
+ /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+ if (num_tc < 0)
+ return -EINVAL;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
@@ -1227,13 +1254,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
return -ENOMEM;
rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
+ dev_maps = rcu_dereference(dev->xps_cpus_map);
if (dev_maps) {
for_each_possible_cpu(cpu) {
int i, tci = cpu * num_tc + tc;
struct xps_map *map;
- map = rcu_dereference(dev_maps->cpu_map[tci]);
+ map = rcu_dereference(dev_maps->attr_map[tci]);
if (!map)
continue;
@@ -1260,6 +1287,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
cpumask_var_t mask;
int err;
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -1283,6 +1313,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+/* sysfs show handler for the "xps_rxqs" attribute of a transmit queue.
+ *
+ * Builds a bitmap with one bit per receive queue; a bit is set when this
+ * transmit queue's index appears in the XPS map entry of that receive
+ * queue (for the queue's traffic class when traffic classes are in use).
+ * The bitmap is then printed into @buf.
+ */
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+	struct net_device *dev = queue->dev;
+	struct xps_dev_maps *maps;
+	unsigned long *rxq_mask, txq;
+	int rxq, len, num_tc = 1, tc = 0;
+
+	txq = get_netdev_queue_index(queue);
+
+	if (dev->num_tc) {
+		num_tc = dev->num_tc;
+		tc = netdev_txq_to_tc(dev, txq);
+		if (tc < 0)
+			return -EINVAL;
+	}
+
+	rxq_mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+			   GFP_KERNEL);
+	if (!rxq_mask)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	maps = rcu_dereference(dev->xps_rxqs_map);
+	if (maps) {
+		rxq = netif_attrmask_next(-1, NULL, dev->num_rx_queues);
+		while (rxq < dev->num_rx_queues) {
+			struct xps_map *map;
+			int pos;
+
+			map = rcu_dereference(maps->attr_map[rxq * num_tc + tc]);
+			if (map) {
+				pos = map->len;
+				while (pos--) {
+					if (map->queues[pos] == txq) {
+						set_bit(rxq, rxq_mask);
+						break;
+					}
+				}
+			}
+
+			rxq = netif_attrmask_next(rxq, NULL,
+						  dev->num_rx_queues);
+		}
+	}
+	rcu_read_unlock();
+
+	len = bitmap_print_to_pagebuf(false, buf, rxq_mask,
+				      dev->num_rx_queues);
+	kfree(rxq_mask);
+
+	if (len < PAGE_SIZE)
+		return len;
+	return -EINVAL;
+}
+
+/* sysfs store handler for the "xps_rxqs" attribute of a transmit queue.
+ *
+ * Parses @buf as a bitmap of receive queues and installs it as the
+ * receive-queue XPS map of this transmit queue.  The caller needs
+ * CAP_NET_ADMIN in the user namespace of the device's network namespace.
+ *
+ * Returns @len on success or a negative errno.
+ */
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+			      size_t len)
+{
+	struct net_device *dev = queue->dev;
+	struct net *net = dev_net(dev);
+	unsigned long *rxq_mask, txq;
+	int rc;
+
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	rxq_mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+			   GFP_KERNEL);
+	if (!rxq_mask)
+		return -ENOMEM;
+
+	txq = get_netdev_queue_index(queue);
+
+	rc = bitmap_parse(buf, len, rxq_mask, dev->num_rx_queues);
+	if (!rc)
+		rc = __netif_set_xps_queue(dev, rxq_mask, txq, true);
+
+	kfree(rxq_mask);
+	if (rc)
+		return rc;
+	return len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+ = __ATTR_RW(xps_rxqs);
#endif /* CONFIG_XPS */
static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1402,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
&queue_traffic_class.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+ &xps_rxqs_attribute.attr,
&queue_tx_maxrate.attr,
#endif
NULL
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 49368e21d228..308ed04984de 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1265,7 +1265,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strncpy(pkt_dev->dst_min, buf, len);
+ strcpy(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
@@ -1280,14 +1280,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (len < 0)
return len;
-
if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
-
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strncpy(pkt_dev->dst_max, buf, len);
+ strcpy(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
@@ -1396,7 +1394,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strncpy(pkt_dev->src_min, buf, len);
+ strcpy(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
@@ -1416,7 +1414,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strncpy(pkt_dev->src_max, buf, len);
+ strcpy(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ef61222fdef..92b6fa5d5f6e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -964,7 +964,8 @@ static size_t rtnl_xdp_size(void)
{
size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
nla_total_size(1) + /* XDP_ATTACHED */
- nla_total_size(4); /* XDP_PROG_ID */
+ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */
+ nla_total_size(4); /* XDP_<mode>_PROG_ID */
return xdp_size;
}
@@ -1353,27 +1354,51 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
-static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
- const struct net_device_ops *ops = dev->netdev_ops;
const struct bpf_prog *generic_xdp_prog;
- struct netdev_bpf xdp;
ASSERT_RTNL();
- *prog_id = 0;
generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
- if (generic_xdp_prog) {
- *prog_id = generic_xdp_prog->aux->id;
- return XDP_ATTACHED_SKB;
- }
- if (!ops->ndo_bpf)
- return XDP_ATTACHED_NONE;
+ if (!generic_xdp_prog)
+ return 0;
+ return generic_xdp_prog->aux->id;
+}
+
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+}
+
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
+ XDP_QUERY_PROG_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+ u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+ u32 (*get_prog_id)(struct net_device *dev))
+{
+ u32 curr_id;
+ int err;
+
+ curr_id = get_prog_id(dev);
+ if (!curr_id)
+ return 0;
- __dev_xdp_query(dev, ops->ndo_bpf, &xdp);
- *prog_id = xdp.prog_id;
+ *prog_id = curr_id;
+ err = nla_put_u32(skb, attr, curr_id);
+ if (err)
+ return err;
+
+ if (*mode != XDP_ATTACHED_NONE)
+ *mode = XDP_ATTACHED_MULTI;
+ else
+ *mode = tgt_mode;
- return xdp.prog_attached;
+ return 0;
}
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1381,17 +1406,32 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
struct nlattr *xdp;
u32 prog_id;
int err;
+ u8 mode;
xdp = nla_nest_start(skb, IFLA_XDP);
if (!xdp)
return -EMSGSIZE;
- err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
- rtnl_xdp_attached_mode(dev, &prog_id));
+ prog_id = 0;
+ mode = XDP_ATTACHED_NONE;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+ IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+ IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+ IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw);
+ if (err)
+ goto err_cancel;
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
if (err)
goto err_cancel;
- if (prog_id) {
+ if (prog_id && mode != XDP_ATTACHED_MULTI) {
err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
if (err)
goto err_cancel;
@@ -2759,9 +2799,12 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
return err;
}
- dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
-
- __dev_notify_flags(dev, old_flags, ~0U);
+ if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
+ __dev_notify_flags(dev, old_flags, 0U);
+ } else {
+ dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+ __dev_notify_flags(dev, old_flags, ~0U);
+ }
return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8e51f8555e11..0c1a00672ba9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3816,14 +3816,14 @@ err:
}
EXPORT_SYMBOL_GPL(skb_segment);
-int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
- struct sk_buff *lp, *p = *head;
unsigned int delta_truesize;
+ struct sk_buff *lp;
if (unlikely(p->len + len >= 65536))
return -E2BIG;
@@ -4899,7 +4899,6 @@ EXPORT_SYMBOL(skb_try_coalesce);
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
- skb->tstamp = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
@@ -4912,8 +4911,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
return;
ipvs_reset(skb);
- skb_orphan(skb);
skb->mark = 0;
+ skb->tstamp = 0;
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9e8f65585b81..03fdea5b0f57 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -91,6 +91,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
@@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop);
int sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
+ struct sock_txtime sk_txtime;
struct sock *sk = sock->sk;
int val;
int valbool;
@@ -1070,6 +1072,26 @@ set_rcvbuf:
}
break;
+ case SO_TXTIME:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ } else if (optlen != sizeof(struct sock_txtime)) {
+ ret = -EINVAL;
+ } else if (copy_from_user(&sk_txtime, optval,
+ sizeof(struct sock_txtime))) {
+ ret = -EFAULT;
+ } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
+ ret = -EINVAL;
+ } else {
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ }
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1115,6 +1137,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
u64 val64;
struct linger ling;
struct timeval tm;
+ struct sock_txtime txtime;
} v;
int lv = sizeof(int);
@@ -1403,6 +1426,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sock_flag(sk, SOCK_ZEROCOPY);
break;
+ case SO_TXTIME:
+ lv = sizeof(v.txtime);
+ v.txtime.clockid = sk->sk_clockid;
+ v.txtime.flags |= sk->sk_txtime_deadline_mode ?
+ SOF_TXTIME_DEADLINE_MODE : 0;
+ v.txtime.flags |= sk->sk_txtime_report_errors ?
+ SOF_TXTIME_REPORT_ERRORS : 0;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2137,6 +2169,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
sockc->tsflags |= tsflags;
break;
+ case SCM_TXTIME:
+ if (!sock_flag(sk, SOCK_TXTIME))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
+ return -EINVAL;
+ sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
+ break;
/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
case SCM_RIGHTS:
case SCM_CREDENTIALS:
@@ -2401,9 +2440,10 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct proto *prot = sk->sk_prot;
long allocated = sk_memory_allocated_add(sk, amt);
+ bool charged = true;
if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
+ !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
goto suppress_allocation;
/* Under limit. */
@@ -2461,7 +2501,8 @@ suppress_allocation:
return 1;
}
- trace_sock_exceed_buf_limit(sk, prot, allocated);
+ if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
sk_memory_allocated_sub(sk, amt);
@@ -2818,6 +2859,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0U;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
+
+ sk_rx_queue_clear(sk);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9d1f22072d5d..57285383ed00 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -3,8 +3,11 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
* Released under terms in GPL version 2. See COPYING.
*/
+#include <linux/bpf.h>
+#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
@@ -45,8 +48,8 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
!= sizeof(u32));
- /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
- return key << RHT_HASH_RESERVED_SPACE;
+ /* Use cyclic increasing ID as direct hash key */
+ return key;
}
static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
@@ -370,3 +373,34 @@ void xdp_return_buff(struct xdp_buff *xdp)
__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
+
+int xdp_attachment_query(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ bpf->prog_id = info->prog ? info->prog->aux->id : 0;
+ bpf->prog_flags = info->prog ? info->flags : 0;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_query);
+
+bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "program loaded with different flags");
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
+
+void xdp_attachment_setup(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog)
+ bpf_prog_put(info->prog);
+ info->prog = bpf->prog;
+ info->flags = bpf->flags;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_setup);
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 1b2120645730..34aba55ed573 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -491,6 +491,7 @@ static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
break;
case DN_RUN:
sk->sk_shutdown |= SHUTDOWN_MASK;
+ /* fall through */
case DN_CC:
scp->state = DN_CN;
}
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index dc5d9af3dc80..a1917025e155 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
if (!ds)
return NULL;
+ /* We avoid allocating memory outside dsa_switch
+ * if it is not needed.
+ */
+ if (n <= sizeof(ds->_bitmap) * 8) {
+ ds->bitmap = &ds->_bitmap;
+ } else {
+ ds->bitmap = devm_kcalloc(dev,
+ BITS_TO_LONGS(n),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (unlikely(!ds->bitmap))
+ return NULL;
+ }
+
ds->dev = dev;
ds->num_ports = n;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 1e3b6a6d8a40..71536c435132 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -900,7 +900,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
switch (f->command) {
case TC_BLOCK_BIND:
- return tcf_block_cb_register(f->block, cb, dev, dev);
+ return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
case TC_BLOCK_UNBIND:
tcf_block_cb_unregister(f->block, cb, dev);
return 0;
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index b93511726069..142b294d3446 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
{
const struct switchdev_obj_port_mdb *mdb = info->mdb;
struct switchdev_trans *trans = info->trans;
- DECLARE_BITMAP(group, ds->num_ports);
int port;
/* Build a mask of Multicast group members */
- bitmap_zero(group, ds->num_ports);
+ bitmap_zero(ds->bitmap, ds->num_ports);
if (ds->index == info->sw_index)
- set_bit(info->port, group);
+ set_bit(info->port, ds->bitmap);
for (port = 0; port < ds->num_ports; port++)
if (dsa_is_dsa_port(ds, port))
- set_bit(port, group);
+ set_bit(port, ds->bitmap);
if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_mdb_prepare_bitmap(ds, mdb, group);
+ return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap);
- dsa_switch_mdb_add_bitmap(ds, mdb, group);
+ dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap);
return 0;
}
@@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
{
const struct switchdev_obj_port_vlan *vlan = info->vlan;
struct switchdev_trans *trans = info->trans;
- DECLARE_BITMAP(members, ds->num_ports);
int port;
/* Build a mask of VLAN members */
- bitmap_zero(members, ds->num_ports);
+ bitmap_zero(ds->bitmap, ds->num_ports);
if (ds->index == info->sw_index)
- set_bit(info->port, members);
+ set_bit(info->port, ds->bitmap);
for (port = 0; port < ds->num_ports; port++)
if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
- set_bit(port, members);
+ set_bit(port, ds->bitmap);
if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_vlan_prepare_bitmap(ds, vlan, members);
+ return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap);
- dsa_switch_vlan_add_bitmap(ds, vlan, members);
+ dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap);
return 0;
}
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index ee28440f57c5..fd8faa0dfa61 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
}
EXPORT_SYMBOL(sysfs_format_mac);
-struct sk_buff **eth_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
- struct ethhdr *eh, *eh2;
- unsigned int hlen, off_eth;
const struct packet_offload *ptype;
+ unsigned int hlen, off_eth;
+ struct sk_buff *pp = NULL;
+ struct ethhdr *eh, *eh2;
+ struct sk_buff *p;
__be16 type;
int flush = 1;
@@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b403499fdabe..f2a0a3bab6b5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog)
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
sk->sk_max_ack_backlog = backlog;
err = 0;
@@ -1384,12 +1385,12 @@ out:
}
EXPORT_SYMBOL(inet_gso_segment);
-struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
- struct sk_buff *p;
+ struct sk_buff *pp = NULL;
const struct iphdr *iph;
+ struct sk_buff *p;
unsigned int hlen;
unsigned int off;
unsigned int id;
@@ -1425,7 +1426,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
struct iphdr *iph2;
u16 flush_id;
@@ -1505,8 +1506,8 @@ out:
}
EXPORT_SYMBOL(inet_gro_receive);
-static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ipip_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
if (NAPI_GRO_CB(skb)->encap_mark) {
NAPI_GRO_CB(skb)->flush = 1;
@@ -1882,6 +1883,7 @@ fs_initcall(ipv4_offload_init);
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
+ .list_func = ip_list_rcv,
};
static int __init inet_init(void)
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 7cf755ef9efb..bbeecd13e534 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -28,8 +28,8 @@
#include <linux/spinlock.h>
#include <net/udp.h>
-static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *esp4_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index c9ec1603666b..500a59906b87 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -224,14 +224,14 @@ drop:
return 0;
}
-static struct sk_buff **fou_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *fou_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
- const struct net_offload *ops;
- struct sk_buff **pp = NULL;
u8 proto = fou_from_sock(sk)->protocol;
const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
@@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
return guehdr;
}
-static struct sk_buff **gue_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *gue_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload **offloads;
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct guehdr *guehdr;
size_t len, optlen, hdrlen, off;
@@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
skb_gro_pull(skb, hdrlen);
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct guehdr *guehdr2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a7d980105f6..6c63524f598a 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -108,10 +108,10 @@ out:
return segs;
}
-static struct sk_buff **gre_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *gre_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
const struct gre_base_hdr *greh;
unsigned int hlen, grehlen;
@@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
null_compute_pseudo);
}
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct gre_base_hdr *greh2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1617604c9284..695979b7ef6d 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -429,14 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
icmp_param->data.icmph.checksum = 0;
+ ipcm_init(&ipc);
inet->tos = ip_hdr(skb)->tos;
sk->sk_mark = mark;
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
if (icmp_param->replyopts.opt.opt.optlen) {
ipc.opt = &icmp_param->replyopts.opt;
@@ -710,11 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
sk->sk_mark = mark;
+ ipcm_init(&ipc);
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts.opt;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
type, code, &icmp_param);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 1e4cf3ab560f..d3162baca9f1 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -20,6 +20,7 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <linux/rhashtable.h>
#include <net/sock.h>
#include <net/inet_frag.h>
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2d8efeecf619..c8ca5d8f0f75 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
goto err_free_skb;
key = &tun_info->key;
+ if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+ goto err_free_rt;
md = ip_tunnel_info_opts(tun_info);
if (!md)
goto err_free_rt;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7582713dd18f..3196cf58f418 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,7 +307,8 @@ drop:
return true;
}
-static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
@@ -315,13 +316,6 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
struct rtable *rt;
int err;
- /* if ingress device is enslaved to an L3 master device pass the
- * skb to its handler for processing
- */
- skb = l3mdev_ip_rcv(skb);
- if (!skb)
- return NET_RX_SUCCESS;
-
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -393,7 +387,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
goto drop;
}
- return dst_input(skb);
+ return NET_RX_SUCCESS;
drop:
kfree_skb(skb);
@@ -405,13 +399,29 @@ drop_error:
goto drop;
}
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
+ ret = ip_rcv_finish_core(net, sk, skb);
+ if (ret != NET_RX_DROP)
+ ret = dst_input(skb);
+ return ret;
+}
+
/*
* Main IP Receive routine.
*/
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
- struct net *net;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
@@ -421,7 +431,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
- net = dev_net(dev);
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
@@ -489,9 +498,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
- net, NULL, skb, dev, NULL,
- ip_rcv_finish);
+ return skb;
csum_error:
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
@@ -500,5 +507,113 @@ inhdr_error:
drop:
kfree_skb(skb);
out:
- return NET_RX_DROP;
+ return NULL;
+}
+
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net *net = dev_net(dev);
+
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ return NET_RX_DROP;
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
+ ip_rcv_finish);
+}
+
+static void ip_sublist_rcv_finish(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ list_del(&skb->list);
+ /* Handle ip{6}_forward case, as sch_direct_xmit have
+ * another kind of SKB-list usage (see validate_xmit_skb_list)
+ */
+ skb->next = NULL;
+ dst_input(skb);
+ }
+}
+
+static void ip_list_rcv_finish(struct net *net, struct sock *sk,
+ struct list_head *head)
+{
+ struct dst_entry *curr_dst = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct dst_entry *dst;
+
+ list_del(&skb->list);
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ continue;
+ if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
+ continue;
+
+ dst = skb_dst(skb);
+ if (curr_dst != dst) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip_sublist_rcv_finish(&sublist);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dst = dst;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip_sublist_rcv_finish(&sublist);
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+ struct net *net)
+{
+ NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+ head, dev, NULL, ip_rcv_finish);
+ ip_list_rcv_finish(net, NULL, head);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net_device *curr_dev = NULL;
+ struct net *curr_net = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+
+ list_del(&skb->list);
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ continue;
+
+ if (curr_dev != dev || curr_net != net) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dev = dev;
+ curr_net = net;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9d9762..e2b6bd478afb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
}
/* Note: skb->sk can be different from sk, in case of tunnels */
-int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+ __u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
- RT_CONN_FLAGS(sk),
+ RT_CONN_FLAGS_TOS(sk, tos),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
@@ -478,7 +479,7 @@ packet_routed:
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
- *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+ *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
@@ -511,7 +512,7 @@ no_route:
kfree_skb(skb);
return -EHOSTUNREACH;
}
-EXPORT_SYMBOL(ip_queue_xmit);
+EXPORT_SYMBOL(__ip_queue_xmit);
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
@@ -1145,14 +1146,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;
- cork->gso_size = sk->sk_type == SOCK_DGRAM &&
- sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0;
+ cork->gso_size = ipc->gso_size;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
cork->priority = ipc->priority;
- cork->tx_flags = ipc->tx_flags;
+ cork->transmit_time = ipc->sockc.transmit_time;
+ cork->tx_flags = 0;
+ sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
return 0;
}
@@ -1413,6 +1415,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = cork->transmit_time;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount
@@ -1545,11 +1548,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
return;
+ ipcm_init(&ipc);
ipc.addr = daddr;
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9f79b9803a16..5660adcf7a04 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
#include <linux/export.h>
+#include <linux/rhashtable.h>
#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
@@ -1051,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *skb;
int ret;
- if (assert == IGMPMSG_WHOLEPKT)
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
skb = alloc_skb(128, GFP_ATOMIC);
@@ -1059,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
if (!skb)
return -ENOBUFS;
- if (assert == IGMPMSG_WHOLEPKT) {
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
/* Ugly, but we have no choice with this interface.
* Duplicate old header, fix ihl, length etc.
* And all this only to mangle msg->im_msgtype and
@@ -1070,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt,
skb_reset_transport_header(skb);
msg = (struct igmpmsg *)skb_network_header(skb);
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
- msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_msgtype = assert;
msg->im_mbz = 0;
- msg->im_vif = mrt->mroute_reg_vif_num;
+ if (assert == IGMPMSG_WRVIFWHOLE)
+ msg->im_vif = vifi;
+ else
+ msg->im_vif = mrt->mroute_reg_vif_num;
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -1371,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
struct mr_table *mrt;
struct vifctl vif;
struct mfcctl mfc;
+ bool do_wrvifwhole;
u32 uval;
/* There's one exception to the lock - MRT_DONE which needs to unlock */
@@ -1501,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
break;
}
+ do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
val = !!val;
if (val != mrt->mroute_do_pim) {
mrt->mroute_do_pim = val;
mrt->mroute_do_assert = val;
+ mrt->mroute_do_wrvifwhole = do_wrvifwhole;
}
break;
case MRT_TABLE:
@@ -1982,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
MFC_ASSERT_THRESH)) {
c->_c.mfc_un.res.last_assert = jiffies;
ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ipmr_cache_report(mrt, skb, true_vifi,
+ IGMPMSG_WRVIFWHOLE);
}
goto dont_forward;
}
@@ -2658,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
mrt->mroute_reg_vif_num) ||
nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
mrt->mroute_do_assert) ||
- nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+ mrt->mroute_do_wrvifwhole))
return false;
return true;
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index cafb0506c8c9..1ad9aa62a97b 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -2,6 +2,7 @@
* Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
*/
+#include <linux/rhashtable.h>
#include <linux/mroute_base.h>
/* Sets everything common except 'dev', since that is done under locking */
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 4388de0e5380..1e6f28c97d3a 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = {
};
/* One level of recursion won't kill us */
-static void dump_ipv4_packet(struct nf_log_buf *m,
+static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int iphoff)
{
@@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
/* Max length: 3+maxlen */
if (!iphoff) { /* Only recurse once. */
nf_log_buf_add(m, "[");
- dump_ipv4_packet(m, info, skb,
+ dump_ipv4_packet(net, m, info, skb,
iphoff + ih->ihl*4+sizeof(_icmph));
nf_log_buf_add(m, "] ");
}
@@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
/* Max length: 15 "UID=4294967295 " */
if ((logflags & NF_LOG_UID) && !iphoff)
- nf_log_dump_sk_uid_gid(m, skb->sk);
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (!iphoff && skb->mark)
@@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
if (in != NULL)
dump_ipv4_mac_header(m, loginfo, skb);
- dump_ipv4_packet(m, loginfo, skb, 0);
+ dump_ipv4_packet(net, m, loginfo, skb, 0);
nf_log_buf_close(m);
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2ed64bca54e3..b54c964ad925 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -739,13 +739,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* no remote port */
}
- ipc.sockc.tsflags = sk->sk_tsflags;
- ipc.addr = inet->inet_saddr;
- ipc.opt = NULL;
- ipc.oif = sk->sk_bound_dev_if;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
+ ipcm_init_sk(&ipc, inet);
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -769,8 +763,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
- sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
-
saddr = ipc.addr;
ipc.addr = faddr = daddr;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 77350c1256ce..b46e4cf9a55a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -287,6 +287,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
+ SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
+ SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index abb3c9490c55..33df4d76db2d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = sockc->transmit_time;
skb_dst_set(skb, &rt->dst);
*rtp = NULL;
@@ -561,13 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
daddr = inet->inet_daddr;
}
- ipc.sockc.tsflags = sk->sk_tsflags;
- ipc.addr = inet->inet_saddr;
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
- ipc.oif = sk->sk_bound_dev_if;
+ ipcm_init_sk(&ipc, inet);
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -670,8 +665,6 @@ back_from_confirm:
&rt, msg->msg_flags, &ipc.sockc);
else {
- sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
-
if (!ipc.addr)
ipc.addr = fl4.daddr;
lock_sock(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4491faf83f4f..bce53b1728a6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -817,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
* This occurs when user tries to read
* from never connected socket.
*/
- if (!sock_flag(sk, SOCK_DONE))
- ret = -ENOTCONN;
+ ret = -ENOTCONN;
break;
}
if (!timeo) {
@@ -1241,7 +1240,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
- sockc.tsflags = sk->sk_tsflags;
+ sockcm_init(&sockc, sk);
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
if (unlikely(err)) {
@@ -1275,9 +1274,6 @@ restart:
int linear;
new_segment:
- /* Allocate new segment. If the interface is SG,
- * allocate skb fitting to single page.
- */
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -2042,13 +2038,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
break;
if (sk->sk_state == TCP_CLOSE) {
- if (!sock_flag(sk, SOCK_DONE)) {
- /* This occurs when user tries to read
- * from never connected socket.
- */
- copied = -ENOTCONN;
- break;
- }
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ copied = -ENOTCONN;
break;
}
@@ -2576,6 +2569,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
+ tp->rcv_rtt_last_tsecr = 0;
tp->write_seq += tp->max_window + 2;
if (tp->write_seq == 0)
tp->write_seq = 1;
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 58e2f479ffb4..3b5f45b9e81e 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -205,7 +205,11 @@ static u32 bbr_bw(const struct sock *sk)
*/
static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
- rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
+ unsigned int mss = tcp_sk(sk)->mss_cache;
+
+ if (!tcp_needs_internal_pacing(sk))
+ mss = tcp_mss_to_mtu(sk, mss);
+ rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
rate *= USEC_PER_SEC;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8e5522c6833a..91dbb9afb950 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/static_key.h>
+#include <net/busy_poll.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -582,9 +583,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->rx_opt.rcv_tsecr &&
- (TCP_SKB_CB(skb)->end_seq -
- TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
+ if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
+ return;
+ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
+
+ if (TCP_SKB_CB(skb)->end_seq -
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
u32 delta_us;
@@ -3458,7 +3462,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
static void tcp_store_ts_recent(struct tcp_sock *tp)
{
tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = get_seconds();
+ tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
}
static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
@@ -4339,6 +4343,11 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+#ifdef CONFIG_TLS_DEVICE
+ if (from->decrypted != to->decrypted)
+ return false;
+#endif
+
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -4617,8 +4626,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
skb->data_len = data_len;
skb->len = size;
- if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto err_free;
+ }
err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
if (err)
@@ -4674,18 +4685,21 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
* Out of sequence packets to the out_of_order_queue.
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
- if (tcp_receive_window(tp) == 0)
+ if (tcp_receive_window(tp) == 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
+ }
/* Ok. In sequence. In window. */
queue_and_out:
if (skb_queue_len(&sk->sk_receive_queue) == 0)
sk_forced_mem_schedule(sk, skb->truesize);
- else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
+ }
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
- tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4741,8 +4755,10 @@ drop:
/* If window is closed, drop tail of packet. But after
* remembering D-SACK for its head made in previous line.
*/
- if (!tcp_receive_window(tp))
+ if (!tcp_receive_window(tp)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
+ }
goto queue_and_out;
}
@@ -4860,6 +4876,9 @@ restart:
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+#ifdef CONFIG_TLS_DEVICE
+ nskb->decrypted = skb->decrypted;
+#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
@@ -4887,6 +4906,10 @@ restart:
skb == tail ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
+#ifdef CONFIG_TLS_DEVICE
+ if (skb->decrypted != nskb->decrypted)
+ goto end;
+#endif
}
}
}
@@ -5484,6 +5507,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
+ /* When receiving pure ack in fast path, update
+ * last ts ecr directly instead of calling
+ * tcp_rcv_rtt_measure_ts()
+ */
+ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
return;
} else { /* Header too small */
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
@@ -5585,6 +5613,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
+ sk_mark_napi_id(sk, skb);
}
tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
@@ -6413,6 +6442,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
+ sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3b2711e33e4c..9e041fa5c545 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+ (!twp || (reuse && time_after32(ktime_get_seconds(),
+ tcptw->tw_ts_recent_stamp)))) {
/* In case of repair and re-using TIME-WAIT sockets we still
* want to be sure that it is safe as above but honor the
* sequence numbers and time stamps set as part of the repair
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1dda1341a223..75ef332a7caf 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tw->tw_substate = TCP_TIME_WAIT;
tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent_stamp = get_seconds();
+ tcptw->tw_ts_recent_stamp = ktime_get_seconds();
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
@@ -189,7 +189,7 @@ kill:
if (tmp_opt.saw_tstamp) {
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
- tcptw->tw_ts_recent_stamp = get_seconds();
+ tcptw->tw_ts_recent_stamp = ktime_get_seconds();
}
inet_twsk_put(tw);
@@ -449,119 +449,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk;
+ struct tcp_sock *oldtp, *newtp;
- if (newsk) {
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_request_sock *treq = tcp_rsk(req);
- struct inet_connection_sock *newicsk = inet_csk(newsk);
- struct tcp_sock *newtp = tcp_sk(newsk);
- struct tcp_sock *oldtp = tcp_sk(sk);
-
- smc_check_reset_syn_req(oldtp, req, newtp);
-
- /* Now setup tcp_sock */
- newtp->pred_flags = 0;
-
- newtp->rcv_wup = newtp->copied_seq =
- newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->segs_in = 1;
-
- newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
-
- INIT_LIST_HEAD(&newtp->tsq_node);
- INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
-
- tcp_init_wl(newtp, treq->rcv_isn);
-
- newtp->srtt_us = 0;
- newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
- newicsk->icsk_rto = TCP_TIMEOUT_INIT;
- newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
-
- newtp->packets_out = 0;
- newtp->retrans_out = 0;
- newtp->sacked_out = 0;
- newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- newtp->tlp_high_seq = 0;
- newtp->lsndtime = tcp_jiffies32;
- newsk->sk_txhash = treq->txhash;
- newtp->last_oow_ack_time = 0;
- newtp->total_retrans = req->num_retrans;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- newtp->snd_cwnd = TCP_INIT_CWND;
- newtp->snd_cwnd_cnt = 0;
-
- /* There's a bubble in the pipe until at least the first ACK. */
- newtp->app_limited = ~0U;
-
- tcp_init_xmit_timers(newsk);
- newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
-
- newtp->rx_opt.saw_tstamp = 0;
-
- newtp->rx_opt.dsack = 0;
- newtp->rx_opt.num_sacks = 0;
-
- newtp->urg_data = 0;
-
- if (sock_flag(newsk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
-
- newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
- newtp->rx_opt.sack_ok = ireq->sack_ok;
- newtp->window_clamp = req->rsk_window_clamp;
- newtp->rcv_ssthresh = req->rsk_rcv_wnd;
- newtp->rcv_wnd = req->rsk_rcv_wnd;
- newtp->rx_opt.wscale_ok = ireq->wscale_ok;
- if (newtp->rx_opt.wscale_ok) {
- newtp->rx_opt.snd_wscale = ireq->snd_wscale;
- newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
- } else {
- newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
- newtp->window_clamp = min(newtp->window_clamp, 65535U);
- }
- newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
- newtp->rx_opt.snd_wscale);
- newtp->max_window = newtp->snd_wnd;
-
- if (newtp->rx_opt.tstamp_ok) {
- newtp->rx_opt.ts_recent = req->ts_recent;
- newtp->rx_opt.ts_recent_stamp = get_seconds();
- newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
- } else {
- newtp->rx_opt.ts_recent_stamp = 0;
- newtp->tcp_header_len = sizeof(struct tcphdr);
- }
- newtp->tsoffset = treq->ts_off;
+ if (!newsk)
+ return NULL;
+
+ newicsk = inet_csk(newsk);
+ newtp = tcp_sk(newsk);
+ oldtp = tcp_sk(sk);
+
+ smc_check_reset_syn_req(oldtp, req, newtp);
+
+ /* Now setup tcp_sock */
+ newtp->pred_flags = 0;
+
+ newtp->rcv_wup = newtp->copied_seq =
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+ newtp->segs_in = 1;
+
+ newtp->snd_sml = newtp->snd_una =
+ newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
+
+ INIT_LIST_HEAD(&newtp->tsq_node);
+ INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
+
+ tcp_init_wl(newtp, treq->rcv_isn);
+
+ newtp->srtt_us = 0;
+ newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
+ newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+ newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
+
+ newtp->packets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->sacked_out = 0;
+ newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ newtp->tlp_high_seq = 0;
+ newtp->lsndtime = tcp_jiffies32;
+ newsk->sk_txhash = treq->txhash;
+ newtp->last_oow_ack_time = 0;
+ newtp->total_retrans = req->num_retrans;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = TCP_INIT_CWND;
+ newtp->snd_cwnd_cnt = 0;
+
+ /* There's a bubble in the pipe until at least the first ACK. */
+ newtp->app_limited = ~0U;
+
+ tcp_init_xmit_timers(newsk);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+
+ newtp->rx_opt.saw_tstamp = 0;
+
+ newtp->rx_opt.dsack = 0;
+ newtp->rx_opt.num_sacks = 0;
+
+ newtp->urg_data = 0;
+
+ if (sock_flag(newsk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
+
+ newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+ newtp->rx_opt.sack_ok = ireq->sack_ok;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
+ newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+ if (newtp->rx_opt.wscale_ok) {
+ newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+ newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
+ } else {
+ newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp, 65535U);
+ }
+ newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
+ newtp->max_window = newtp->snd_wnd;
+
+ if (newtp->rx_opt.tstamp_ok) {
+ newtp->rx_opt.ts_recent = req->ts_recent;
+ newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
- newtp->md5sig_info = NULL; /*XXX*/
- if (newtp->af_specific->md5_lookup(sk, newsk))
- newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+ newtp->md5sig_info = NULL; /*XXX*/
+ if (newtp->af_specific->md5_lookup(sk, newsk))
+ newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
- if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
- newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
- newtp->rx_opt.mss_clamp = req->mss;
- tcp_ecn_openreq_child(newtp, req);
- newtp->fastopen_req = NULL;
- newtp->fastopen_rsk = NULL;
- newtp->syn_data_acked = 0;
- newtp->rack.mstamp = 0;
- newtp->rack.advanced = 0;
- newtp->rack.reo_wnd_steps = 1;
- newtp->rack.last_delivered = 0;
- newtp->rack.reo_wnd_persist = 0;
- newtp->rack.dsack_seen = 0;
-
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
- }
+ if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
+ newtp->rx_opt.mss_clamp = req->mss;
+ tcp_ecn_openreq_child(newtp, req);
+ newtp->fastopen_req = NULL;
+ newtp->fastopen_rsk = NULL;
+ newtp->syn_data_acked = 0;
+ newtp->rack.mstamp = 0;
+ newtp->rack.advanced = 0;
+ newtp->rack.reo_wnd_steps = 1;
+ newtp->rack.last_delivered = 0;
+ newtp->rack.reo_wnd_persist = 0;
+ newtp->rack.dsack_seen = 0;
+
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+
return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);
@@ -600,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 8cc7c3487330..870b0a335061 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -180,9 +180,9 @@ out:
return segs;
}
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct tcphdr *th;
struct tcphdr *th2;
@@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
len = skb_gro_len(skb);
flags = tcp_flag_word(th);
- for (; (p = *head); head = &p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
goto found;
}
-
+ p = NULL;
goto out_check_final;
found:
@@ -262,8 +262,11 @@ found:
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+#ifdef CONFIG_TLS_DEVICE
+ flush |= p->decrypted ^ skb->decrypted;
+#endif
- if (flush || skb_gro_receive(head, skb)) {
+ if (flush || skb_gro_receive(p, skb)) {
mss = 1;
goto out_check_final;
}
@@ -277,7 +280,7 @@ out_check_final:
TCP_FLAG_FIN));
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
- pp = head;
+ pp = p;
out:
NAPI_GRO_CB(skb)->flush |= (flush != 0);
@@ -302,7 +305,7 @@ int tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);
-static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 00e5a300ddb9..6cbab56e7407 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -973,17 +973,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-/* BBR congestion control needs pacing.
- * Same remark for SO_MAX_PACING_RATE.
- * sch_fq packet scheduler is efficiently handling pacing,
- * but is not always installed/used.
- * Return true if TCP stack should pace packets itself.
- */
-static bool tcp_needs_internal_pacing(const struct sock *sk)
-{
- return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
-}
-
static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
{
u64 len_ns;
@@ -995,9 +984,6 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
if (!rate || rate == ~0U)
return;
- /* Should account for header sizes as sch_fq does,
- * but lets make things simple.
- */
len_ns = (u64)skb->len * NSEC_PER_SEC;
do_div(len_ns, rate);
hrtimer_start(&tcp_sk(sk)->pacing_timer,
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index c61240e43923..4dff40dad4dc 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);
+ /* Record both segment send and ack receive intervals */
+ rs->snd_interval_us = snd_us;
+ rs->rcv_interval_us = ack_us;
+
/* Normally we expect interval_us >= min-rtt.
* Note that rate may still be over-estimated when a spuriously
* retransmistted skb was first (s)acked because "interval_us"
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24e116ddae79..060e841dde40 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -926,11 +926,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
return -EOPNOTSUPP;
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
-
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
fl4 = &inet->cork.fl.u.ip4;
@@ -977,9 +972,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
connected = 1;
}
- ipc.sockc.tsflags = sk->sk_tsflags;
- ipc.addr = inet->inet_saddr;
- ipc.oif = sk->sk_bound_dev_if;
+ ipcm_init_sk(&ipc, inet);
ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
@@ -1027,8 +1020,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
saddr = ipc.addr;
ipc.addr = faddr = daddr;
- sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
-
if (ipc.opt && ipc.opt->opt.srr) {
if (!daddr) {
err = -EINVAL;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 69c54540d5b4..0c0522b79b43 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,10 +343,11 @@ out:
return segs;
}
-struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
- struct udphdr *uh, udp_lookup_t lookup)
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct udphdr *uh, udp_lookup_t lookup)
{
- struct sk_buff *p, **pp = NULL;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
@@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
unflush:
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -399,8 +400,8 @@ out:
}
EXPORT_SYMBOL(udp_gro_receive);
-static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *udp4_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 91580c62bb86..1659a6b3cf42 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -385,8 +385,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
if (ndev->cnf.stable_secret.initialized)
ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
- else
- ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
ndev->cnf.mtu6 = dev->mtu;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -5210,7 +5208,9 @@ static inline size_t inet6_ifla6_size(void)
+ nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
+ nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
+ nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
- + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */
+ + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */
+ + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */
+ + 0;
}
static inline size_t inet6_if_nlmsg_size(void)
@@ -5892,32 +5892,31 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
loff_t *ppos)
{
int ret = 0;
- int new_val;
+ u32 new_val;
struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
struct net *net = (struct net *)ctl->extra2;
+ struct ctl_table tmp = {
+ .data = &new_val,
+ .maxlen = sizeof(new_val),
+ .mode = ctl->mode,
+ };
if (!rtnl_trylock())
return restart_syscall();
- ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ new_val = *((u32 *)ctl->data);
- if (write) {
- new_val = *((int *)ctl->data);
+ ret = proc_douintvec(&tmp, write, buffer, lenp, ppos);
+ if (ret != 0)
+ goto out;
+ if (write) {
if (check_addr_gen_mode(new_val) < 0) {
ret = -EINVAL;
goto out;
}
- /* request for default */
- if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
- ipv6_devconf_dflt.addr_gen_mode = new_val;
-
- /* request for individual net device */
- } else {
- if (!idev)
- goto out;
-
+ if (idev) {
if (check_stable_privacy(idev, net, new_val) < 0) {
ret = -EINVAL;
goto out;
@@ -5927,7 +5926,21 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
idev->cnf.addr_gen_mode = new_val;
addrconf_dev_config(idev->dev);
}
+ } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
+ struct net_device *dev;
+
+ net->ipv6.devconf_dflt->addr_gen_mode = new_val;
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (idev &&
+ idev->cnf.addr_gen_mode != new_val) {
+ idev->cnf.addr_gen_mode = new_val;
+ addrconf_dev_config(idev->dev);
+ }
+ }
}
+
+ *((u32 *)ctl->data) = new_val;
}
out:
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9ed0eae91758..c9535354149f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
static struct packet_type ipv6_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.func = ipv6_rcv,
+ .list_func = ipv6_list_rcv,
};
static int __init ipv6_packet_init(void)
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2ee08b6a86a4..201306b9b5ea 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -736,7 +736,7 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
struct msghdr *msg, struct flowi6 *fl6,
- struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc)
+ struct ipcm6_cookie *ipc6)
{
struct in6_pktinfo *src_info;
struct cmsghdr *cmsg;
@@ -755,7 +755,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
if (cmsg->cmsg_level == SOL_SOCKET) {
- err = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc);
if (err)
return err;
continue;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 27f59b61f70f..ddfa533a84e5 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
return 0;
}
-static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *esp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index be491bf6ab6e..24611c8b0562 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -430,7 +430,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
struct icmp6hdr tmp_hdr;
struct flowi6 fl6;
struct icmpv6_msg msg;
- struct sockcm_cookie sockc_unused = {0};
struct ipcm6_cookie ipc6;
int iif = 0;
int addr_type = 0;
@@ -545,7 +544,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- ipc6.tclass = np->tclass;
+ ipcm6_init_sk(&ipc6, np);
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
@@ -553,8 +552,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
goto out;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- ipc6.dontfrag = np->dontfrag;
- ipc6.opt = NULL;
msg.skb = skb;
msg.offset = skb_network_offset(skb);
@@ -575,7 +572,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr),
&ipc6, &fl6, (struct rt6_info *)dst,
- MSG_DONTWAIT, &sockc_unused)) {
+ MSG_DONTWAIT)) {
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
} else {
@@ -679,7 +676,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
struct dst_entry *dst;
struct ipcm6_cookie ipc6;
u32 mark = IP6_REPLY_MARK(net, skb->mark);
- struct sockcm_cookie sockc_unused = {0};
saddr = &ipv6_hdr(skb)->daddr;
@@ -726,16 +722,14 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
msg.offset = 0;
msg.type = ICMPV6_ECHO_REPLY;
+ ipcm6_init_sk(&ipc6, np);
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
- ipc6.dontfrag = np->dontfrag;
- ipc6.opt = NULL;
if (ip6_append_data(sk, icmpv6_getfrag, &msg,
skb->len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr), &ipc6, &fl6,
- (struct rt6_info *)dst, MSG_DONTWAIT,
- &sockc_unused)) {
+ (struct rt6_info *)dst, MSG_DONTWAIT)) {
__ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
} else {
diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
index 4b32e5921e5c..b7739aba6e68 100644
--- a/net/ipv6/ila/Makefile
+++ b/net/ipv6/ila/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_IPV6_ILA) += ila.o
-ila-objs := ila_common.o ila_lwt.o ila_xlat.o
+ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index 3c7a11b62334..1f747bcbec29 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -19,6 +19,7 @@
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/checksum.h>
+#include <net/genetlink.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <uapi/linux/ila.h>
@@ -104,9 +105,31 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
void ila_init_saved_csum(struct ila_params *p);
+struct ila_net {
+ struct {
+ struct rhashtable rhash_table;
+ spinlock_t *locks; /* Bucket locks for entry manipulation */
+ unsigned int locks_mask;
+ bool hooks_registered;
+ } xlat;
+};
+
int ila_lwt_init(void);
void ila_lwt_fini(void);
-int ila_xlat_init(void);
-void ila_xlat_fini(void);
+
+int ila_xlat_init_net(struct net *net);
+void ila_xlat_exit_net(struct net *net);
+
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_dump_start(struct netlink_callback *cb);
+int ila_xlat_nl_dump_done(struct netlink_callback *cb);
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+extern unsigned int ila_net_id;
+
+extern struct genl_family ila_nl_family;
#endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 8c88ecf29b93..579310466eac 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -154,33 +154,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
iaddr->loc = p->locator;
}
-static int __init ila_init(void)
-{
- int ret;
-
- ret = ila_lwt_init();
-
- if (ret)
- goto fail_lwt;
-
- ret = ila_xlat_init();
- if (ret)
- goto fail_xlat;
-
- return 0;
-fail_xlat:
- ila_lwt_fini();
-fail_lwt:
- return ret;
-}
-
-static void __exit ila_fini(void)
-{
- ila_xlat_fini();
- ila_lwt_fini();
-}
-
-module_init(ila_init);
-module_exit(ila_fini);
-MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
new file mode 100644
index 000000000000..18fac76b9520
--- /dev/null
+++ b/net/ipv6/ila/ila_main.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/genetlink.h>
+#include <net/ila.h>
+#include <net/netns/generic.h>
+#include <uapi/linux/genetlink.h>
+#include "ila.h"
+
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+ [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+ [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
+ [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
+ [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+ [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
+};
+
+static const struct genl_ops ila_nl_ops[] = {
+ {
+ .cmd = ILA_CMD_ADD,
+ .doit = ila_xlat_nl_cmd_add_mapping,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_DEL,
+ .doit = ila_xlat_nl_cmd_del_mapping,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_FLUSH,
+ .doit = ila_xlat_nl_cmd_flush,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_GET,
+ .doit = ila_xlat_nl_cmd_get_mapping,
+ .start = ila_xlat_nl_dump_start,
+ .dumpit = ila_xlat_nl_dump,
+ .done = ila_xlat_nl_dump_done,
+ .policy = ila_nl_policy,
+ },
+};
+
+unsigned int ila_net_id;
+
+struct genl_family ila_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = ILA_GENL_NAME,
+ .version = ILA_GENL_VERSION,
+ .maxattr = ILA_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .ops = ila_nl_ops,
+ .n_ops = ARRAY_SIZE(ila_nl_ops),
+};
+
+static __net_init int ila_init_net(struct net *net)
+{
+ int err;
+
+ err = ila_xlat_init_net(net);
+ if (err)
+ goto ila_xlat_init_fail;
+
+ return 0;
+
+ila_xlat_init_fail:
+ return err;
+}
+
+static __net_exit void ila_exit_net(struct net *net)
+{
+ ila_xlat_exit_net(net);
+}
+
+static struct pernet_operations ila_net_ops = {
+ .init = ila_init_net,
+ .exit = ila_exit_net,
+ .id = &ila_net_id,
+ .size = sizeof(struct ila_net),
+};
+
+static int __init ila_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&ila_net_ops);
+ if (ret)
+ goto register_device_fail;
+
+ ret = genl_register_family(&ila_nl_family);
+ if (ret)
+ goto register_family_fail;
+
+ ret = ila_lwt_init();
+ if (ret)
+ goto fail_lwt;
+
+ return 0;
+
+fail_lwt:
+ genl_unregister_family(&ila_nl_family);
+register_family_fail:
+ unregister_pernet_device(&ila_net_ops);
+register_device_fail:
+ return ret;
+}
+
+static void __exit ila_fini(void)
+{
+ ila_lwt_fini();
+ genl_unregister_family(&ila_nl_family);
+ unregister_pernet_device(&ila_net_ops);
+}
+
+module_init(ila_init);
+module_exit(ila_fini);
+MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 10ae13560b40..51a15ce50a64 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -22,36 +22,14 @@ struct ila_map {
struct rcu_head rcu;
};
-static unsigned int ila_net_id;
-
-struct ila_net {
- struct rhashtable rhash_table;
- spinlock_t *locks; /* Bucket locks for entry manipulation */
- unsigned int locks_mask;
- bool hooks_registered;
-};
-
+#define MAX_LOCKS 1024
#define LOCKS_PER_CPU 10
static int alloc_ila_locks(struct ila_net *ilan)
{
- unsigned int i, size;
- unsigned int nr_pcpus = num_possible_cpus();
-
- nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
- size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
-
- if (sizeof(spinlock_t) != 0) {
- ilan->locks = kvmalloc_array(size, sizeof(spinlock_t),
- GFP_KERNEL);
- if (!ilan->locks)
- return -ENOMEM;
- for (i = 0; i < size; i++)
- spin_lock_init(&ilan->locks[i]);
- }
- ilan->locks_mask = size - 1;
-
- return 0;
+ return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask,
+ MAX_LOCKS, LOCKS_PER_CPU,
+ GFP_KERNEL);
}
static u32 hashrnd __read_mostly;
@@ -71,7 +49,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc)
static inline spinlock_t *ila_get_lock(struct ila_net *ilan,
struct ila_locator loc)
{
- return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask];
+ return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask];
}
static inline int ila_cmp_wildcards(struct ila_map *ila,
@@ -115,16 +93,6 @@ static const struct rhashtable_params rht_params = {
.obj_cmpfn = ila_cmpfn,
};
-static struct genl_family ila_nl_family;
-
-static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
- [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
- [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
- [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
- [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
- [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
-};
-
static int parse_nl_config(struct genl_info *info,
struct ila_xlat_params *xp)
{
@@ -162,7 +130,7 @@ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr,
{
struct ila_map *ila;
- ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc,
+ ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc,
rht_params);
while (ila) {
if (!ila_cmp_wildcards(ila, iaddr, ifindex))
@@ -179,7 +147,7 @@ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp,
{
struct ila_map *ila;
- ila = rhashtable_lookup_fast(&ilan->rhash_table,
+ ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
&xp->ip.locator_match,
rht_params);
while (ila) {
@@ -196,9 +164,9 @@ static inline void ila_release(struct ila_map *ila)
kfree_rcu(ila, rcu);
}
-static void ila_free_cb(void *ptr, void *arg)
+static void ila_free_node(struct ila_map *ila)
{
- struct ila_map *ila = (struct ila_map *)ptr, *next;
+ struct ila_map *next;
/* Assume rcu_readlock held */
while (ila) {
@@ -208,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg)
}
}
+static void ila_free_cb(void *ptr, void *arg)
+{
+ ila_free_node((struct ila_map *)ptr);
+}
+
static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
static unsigned int
@@ -235,7 +208,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
int err = 0, order;
- if (!ilan->hooks_registered) {
+ if (!ilan->xlat.hooks_registered) {
/* We defer registering net hooks in the namespace until the
* first mapping is added.
*/
@@ -244,7 +217,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
if (err)
return err;
- ilan->hooks_registered = true;
+ ilan->xlat.hooks_registered = true;
}
ila = kzalloc(sizeof(*ila), GFP_KERNEL);
@@ -259,12 +232,12 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
spin_lock(lock);
- head = rhashtable_lookup_fast(&ilan->rhash_table,
+ head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
&xp->ip.locator_match,
rht_params);
if (!head) {
/* New entry for the rhash_table */
- err = rhashtable_lookup_insert_fast(&ilan->rhash_table,
+ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table,
&ila->node, rht_params);
} else {
struct ila_map *tila = head, *prev = NULL;
@@ -290,7 +263,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
} else {
/* Make this ila new head */
RCU_INIT_POINTER(ila->next, head);
- err = rhashtable_replace_fast(&ilan->rhash_table,
+ err = rhashtable_replace_fast(&ilan->xlat.rhash_table,
&head->node,
&ila->node, rht_params);
if (err)
@@ -316,7 +289,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
spin_lock(lock);
- head = rhashtable_lookup_fast(&ilan->rhash_table,
+ head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
&xp->ip.locator_match, rht_params);
ila = head;
@@ -346,15 +319,15 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
* table
*/
err = rhashtable_replace_fast(
- &ilan->rhash_table, &ila->node,
+ &ilan->xlat.rhash_table, &ila->node,
&head->node, rht_params);
if (err)
goto out;
} else {
/* Entry no longer used */
- err = rhashtable_remove_fast(&ilan->rhash_table,
- &ila->node,
- rht_params);
+ err = rhashtable_remove_fast(
+ &ilan->xlat.rhash_table,
+ &ila->node, rht_params);
}
}
@@ -369,7 +342,7 @@ out:
return err;
}
-static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_xlat_params p;
@@ -382,7 +355,7 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
return ila_add_mapping(net, &p);
}
-static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_xlat_params xp;
@@ -397,6 +370,59 @@ static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
return 0;
}
+static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan,
+ struct ila_map *ila)
+{
+ return ila_get_lock(ilan, ila->xp.ip.locator_match);
+}
+
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct rhashtable_iter iter;
+ struct ila_map *ila;
+ spinlock_t *lock;
+ int ret;
+
+ ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
+ if (ret)
+ goto done;
+
+ rhashtable_walk_start(&iter);
+
+ for (;;) {
+ ila = rhashtable_walk_next(&iter);
+
+ if (IS_ERR(ila)) {
+ if (PTR_ERR(ila) == -EAGAIN)
+ continue;
+ ret = PTR_ERR(ila);
+ goto done;
+ } else if (!ila) {
+ break;
+ }
+
+ lock = lock_from_ila_map(ilan, ila);
+
+ spin_lock(lock);
+
+ ret = rhashtable_remove_fast(&ilan->xlat.rhash_table,
+ &ila->node, rht_params);
+ if (!ret)
+ ila_free_node(ila);
+
+ spin_unlock(lock);
+
+ if (ret)
+ break;
+ }
+
+done:
+ rhashtable_walk_stop(&iter);
+ return ret;
+}
+
static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
{
if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
@@ -434,7 +460,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -475,27 +501,34 @@ out_free:
struct ila_dump_iter {
struct rhashtable_iter rhiter;
+ int skip;
};
-static int ila_nl_dump_start(struct netlink_callback *cb)
+int ila_xlat_nl_dump_start(struct netlink_callback *cb)
{
struct net *net = sock_net(cb->skb->sk);
struct ila_net *ilan = net_generic(net, ila_net_id);
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+ struct ila_dump_iter *iter;
+ int ret;
- if (!iter) {
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
- cb->args[0] = (long)iter;
+ ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(iter);
+ return ret;
}
- return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
- GFP_KERNEL);
+ iter->skip = 0;
+ cb->args[0] = (long)iter;
+
+ return ret;
}
-static int ila_nl_dump_done(struct netlink_callback *cb)
+int ila_xlat_nl_dump_done(struct netlink_callback *cb)
{
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
@@ -506,24 +539,49 @@ static int ila_nl_dump_done(struct netlink_callback *cb)
return 0;
}
-static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
struct rhashtable_iter *rhiter = &iter->rhiter;
+ int skip = iter->skip;
struct ila_map *ila;
int ret;
rhashtable_walk_start(rhiter);
- for (;;) {
- ila = rhashtable_walk_next(rhiter);
+ /* Get first entry */
+ ila = rhashtable_walk_peek(rhiter);
+
+ if (ila && !IS_ERR(ila) && skip) {
+ /* Skip over visited entries */
+ while (ila && skip) {
+ /* Skip over any ila entries in this list that we
+ * have already dumped.
+ */
+ ila = rcu_access_pointer(ila->next);
+ skip--;
+ }
+ }
+
+ skip = 0;
+
+ for (;;) {
if (IS_ERR(ila)) {
- if (PTR_ERR(ila) == -EAGAIN)
- continue;
ret = PTR_ERR(ila);
- goto done;
+ if (ret == -EAGAIN) {
+ /* Table has changed and iter has reset. Return
+ * -EAGAIN to the application even if we have
+ * written data to the skb. The application
+ * needs to deal with this.
+ */
+
+ goto out_ret;
+ } else {
+ break;
+ }
} else if (!ila) {
+ ret = 0;
break;
}
@@ -532,90 +590,54 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->nlh->nlmsg_seq, NLM_F_MULTI,
skb, ILA_CMD_GET);
if (ret)
- goto done;
+ goto out;
+ skip++;
ila = rcu_access_pointer(ila->next);
}
+
+ skip = 0;
+ ila = rhashtable_walk_next(rhiter);
}
- ret = skb->len;
+out:
+ iter->skip = skip;
+ ret = (skb->len ? : ret);
-done:
+out_ret:
rhashtable_walk_stop(rhiter);
return ret;
}
-static const struct genl_ops ila_nl_ops[] = {
- {
- .cmd = ILA_CMD_ADD,
- .doit = ila_nl_cmd_add_mapping,
- .policy = ila_nl_policy,
- .flags = GENL_ADMIN_PERM,
- },
- {
- .cmd = ILA_CMD_DEL,
- .doit = ila_nl_cmd_del_mapping,
- .policy = ila_nl_policy,
- .flags = GENL_ADMIN_PERM,
- },
- {
- .cmd = ILA_CMD_GET,
- .doit = ila_nl_cmd_get_mapping,
- .start = ila_nl_dump_start,
- .dumpit = ila_nl_dump,
- .done = ila_nl_dump_done,
- .policy = ila_nl_policy,
- },
-};
-
-static struct genl_family ila_nl_family __ro_after_init = {
- .hdrsize = 0,
- .name = ILA_GENL_NAME,
- .version = ILA_GENL_VERSION,
- .maxattr = ILA_ATTR_MAX,
- .netnsok = true,
- .parallel_ops = true,
- .module = THIS_MODULE,
- .ops = ila_nl_ops,
- .n_ops = ARRAY_SIZE(ila_nl_ops),
-};
-
#define ILA_HASH_TABLE_SIZE 1024
-static __net_init int ila_init_net(struct net *net)
+int ila_xlat_init_net(struct net *net)
{
- int err;
struct ila_net *ilan = net_generic(net, ila_net_id);
+ int err;
err = alloc_ila_locks(ilan);
if (err)
return err;
- rhashtable_init(&ilan->rhash_table, &rht_params);
+ rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
return 0;
}
-static __net_exit void ila_exit_net(struct net *net)
+void ila_xlat_exit_net(struct net *net)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
- rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL);
+ rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL);
- kvfree(ilan->locks);
+ free_bucket_spinlocks(ilan->xlat.locks);
- if (ilan->hooks_registered)
+ if (ilan->xlat.hooks_registered)
nf_unregister_net_hooks(net, ila_nf_hook_ops,
ARRAY_SIZE(ila_nf_hook_ops));
}
-static struct pernet_operations ila_net_ops = {
- .init = ila_init_net,
- .exit = ila_exit_net,
- .id = &ila_net_id,
- .size = sizeof(struct ila_net),
-};
-
static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
{
struct ila_map *ila;
@@ -642,28 +664,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
return 0;
}
-int __init ila_xlat_init(void)
-{
- int ret;
-
- ret = register_pernet_device(&ila_net_ops);
- if (ret)
- goto exit;
-
- ret = genl_register_family(&ila_nl_family);
- if (ret < 0)
- goto unregister;
-
- return 0;
-
-unregister:
- unregister_pernet_device(&ila_net_ops);
-exit:
- return ret;
-}
-
-void ila_xlat_fini(void)
-{
- genl_unregister_family(&ila_nl_family);
- unregister_pernet_device(&ila_net_ops);
-}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 3eee7637bdfe..cb54a8a3c273 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -373,7 +373,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
if (olen > 0) {
struct msghdr msg;
struct flowi6 flowi6;
- struct sockcm_cookie sockc_junk;
struct ipcm6_cookie ipc6;
err = -ENOMEM;
@@ -392,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
memset(&flowi6, 0, sizeof(flowi6));
ipc6.opt = fl->opt;
- err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk);
+ err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6);
if (err)
goto done;
err = -EINVAL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index cd2cfb04e5d8..fc7dd3a04360 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -989,6 +989,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
dsfield = key->tos;
+ if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+ goto tx_err;
md = ip_tunnel_info_opts(tun_info);
if (!md)
goto tx_err;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index f08d34491ece..6242682be876 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -47,17 +47,11 @@
#include <net/inet_ecn.h>
#include <net/dst_metadata.h>
-int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
void (*edemux)(struct sk_buff *skb);
- /* if ingress device is enslaved to an L3 master device pass the
- * skb to its handler for processing
- */
- skb = l3mdev_ip6_rcv(skb);
- if (!skb)
- return NET_RX_SUCCESS;
-
if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
const struct inet6_protocol *ipprot;
@@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
}
if (!skb_valid_dst(skb))
ip6_route_input(skb);
+}
+
+int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+ ip6_rcv_finish_core(net, sk, skb);
return dst_input(skb);
}
-int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static void ip6_sublist_rcv_finish(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+
+ list_for_each_entry_safe(skb, next, head, list)
+ dst_input(skb);
+}
+
+static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
+ struct list_head *head)
+{
+ struct dst_entry *curr_dst = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct dst_entry *dst;
+
+ list_del(&skb->list);
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ continue;
+ ip6_rcv_finish_core(net, sk, skb);
+ dst = skb_dst(skb);
+ if (curr_dst != dst) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv_finish(&sublist);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dst = dst;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip6_sublist_rcv_finish(&sublist);
+}
+
+static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
+ struct net *net)
{
const struct ipv6hdr *hdr;
u32 pkt_len;
struct inet6_dev *idev;
- struct net *net = dev_net(skb->dev);
if (skb->pkt_type == PACKET_OTHERHOST) {
kfree_skb(skb);
- return NET_RX_DROP;
+ return NULL;
}
rcu_read_lock();
@@ -196,7 +243,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (ipv6_parse_hopopts(skb) < 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
rcu_read_unlock();
- return NET_RX_DROP;
+ return NULL;
}
}
@@ -205,15 +252,67 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
- net, NULL, skb, dev, NULL,
- ip6_rcv_finish);
+ return skb;
err:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
drop:
rcu_read_unlock();
kfree_skb(skb);
- return NET_RX_DROP;
+ return NULL;
+}
+
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct net *net = dev_net(skb->dev);
+
+ skb = ip6_rcv_core(skb, dev, net);
+ if (skb == NULL)
+ return NET_RX_DROP;
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
+ ip6_rcv_finish);
+}
+
+static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
+ struct net *net)
+{
+ NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL,
+ head, dev, NULL, ip6_rcv_finish);
+ ip6_list_rcv_finish(net, NULL, head);
+}
+
+/* Receive a list of IPv6 packets */
+void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net_device *curr_dev = NULL;
+ struct net *curr_net = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+
+ list_del(&skb->list);
+ skb = ip6_rcv_core(skb, dev, net);
+ if (skb == NULL)
+ continue;
+
+ if (curr_dev != dev || curr_net != net) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dev = dev;
+ curr_net = net;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
}
/*
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 5b3f2f89ef41..37ff4805b20c 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
return len;
}
-static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ipv6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct ipv6hdr *iph;
unsigned int nlen;
@@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
flush--;
nlen = skb_network_header_len(skb);
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct ipv6hdr *iph2;
__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
@@ -263,8 +263,8 @@ out:
return pp;
}
-static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Common GRO receive for SIT and IP6IP6 */
@@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
return ipv6_gro_receive(head, skb);
}
-static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Common GRO receive for SIT and IP6IP6 */
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a14fb4fcdf18..8047fd41ba88 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1219,13 +1219,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
- cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
- sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;
+ cork->base.gso_size = ipc6->gso_size;
+ cork->base.tx_flags = 0;
+ sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
+ cork->base.transmit_time = ipc6->sockc.transmit_time;
+
return 0;
}
@@ -1238,8 +1241,7 @@ static int __ip6_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- unsigned int flags, struct ipcm6_cookie *ipc6,
- const struct sockcm_cookie *sockc)
+ unsigned int flags, struct ipcm6_cookie *ipc6)
{
struct sk_buff *skb, *skb_prev = NULL;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
@@ -1249,7 +1251,6 @@ static int __ip6_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
- __u8 tx_flags = 0;
u32 tskey = 0;
struct rt6_info *rt = (struct rt6_info *)cork->dst;
struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1268,6 +1269,10 @@ static int __ip6_append_data(struct sock *sk,
mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
+ if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = sk->sk_tskey++;
+
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
@@ -1317,13 +1322,6 @@ emsgsize:
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
- if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
- sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
- if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
- tskey = sk->sk_tskey++;
- }
-
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1442,8 +1440,8 @@ alloc_new_skb:
dst_exthdrlen);
/* Only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = tx_flags;
- tx_flags = 0;
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
@@ -1560,8 +1558,7 @@ int ip6_append_data(struct sock *sk,
int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
- struct rt6_info *rt, unsigned int flags,
- const struct sockcm_cookie *sockc)
+ struct rt6_info *rt, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1589,7 +1586,7 @@ int ip6_append_data(struct sock *sk,
return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
&np->cork, sk_page_frag(sk), getfrag,
- from, length, transhdrlen, flags, ipc6, sockc);
+ from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
@@ -1673,6 +1670,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = cork->base.transmit_time;
+
skb_dst_set(skb, dst_clone(&rt->dst));
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
if (proto == IPPROTO_ICMPV6) {
@@ -1747,8 +1746,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
- struct inet_cork_full *cork,
- const struct sockcm_cookie *sockc)
+ struct inet_cork_full *cork)
{
struct inet6_cork v6_cork;
struct sk_buff_head queue;
@@ -1776,7 +1774,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
- flags, ipc6, sockc);
+ flags, ipc6);
if (err) {
__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
return ERR_PTR(err);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0d0f0053bb11..d0b7e0249c13 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -32,6 +32,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compat.h>
+#include <linux/rhashtable.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/raw.h>
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 568ca4187cd1..c0cac9cc3a28 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -500,7 +500,6 @@ sticky_done:
struct ipv6_txoptions *opt = NULL;
struct msghdr msg;
struct flowi6 fl6;
- struct sockcm_cookie sockc_junk;
struct ipcm6_cookie ipc6;
memset(&fl6, 0, sizeof(fl6));
@@ -533,7 +532,7 @@ sticky_done:
msg.msg_control = (void *)(opt+1);
ipc6.opt = opt;
- retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk);
+ retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6);
if (retv)
goto done;
update:
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index b397a8fe88b9..c6bf580d0f33 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -36,7 +36,7 @@ static const struct nf_loginfo default_loginfo = {
};
/* One level of recursion won't kill us */
-static void dump_ipv6_packet(struct nf_log_buf *m,
+static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int ip6hoff,
int recurse)
@@ -258,7 +258,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
/* Max length: 3+maxlen */
if (recurse) {
nf_log_buf_add(m, "[");
- dump_ipv6_packet(m, info, skb,
+ dump_ipv6_packet(net, m, info, skb,
ptr + sizeof(_icmp6h), 0);
nf_log_buf_add(m, "] ");
}
@@ -278,7 +278,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
/* Max length: 15 "UID=4294967295 " */
if ((logflags & NF_LOG_UID) && recurse)
- nf_log_dump_sk_uid_gid(m, skb->sk);
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (recurse && skb->mark)
@@ -365,7 +365,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
if (in != NULL)
dump_ipv6_mac_header(m, loginfo, skb);
- dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
+ dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
nf_log_buf_close(m);
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 96f56bf49a30..4c04bccc7417 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -62,7 +62,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct dst_entry *dst;
struct rt6_info *rt;
struct pingfakehdr pfh;
- struct sockcm_cookie junk = {0};
struct ipcm6_cookie ipc6;
pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
@@ -119,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- ipc6.tclass = np->tclass;
+ ipcm6_init_sk(&ipc6, np);
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
@@ -142,13 +141,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
pfh.family = AF_INET6;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- ipc6.dontfrag = np->dontfrag;
- ipc6.opt = NULL;
lock_sock(sk);
err = ip6_append_data(sk, ping_getfrag, &pfh, len,
0, &ipc6, &fl6, rt,
- MSG_DONTWAIT, &junk);
+ MSG_DONTWAIT);
if (err) {
ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index afc307c89d1a..413d98bf24f4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -620,7 +620,7 @@ out:
static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
struct flowi6 *fl6, struct dst_entry **dstp,
- unsigned int flags)
+ unsigned int flags, const struct sockcm_cookie *sockc)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
@@ -650,6 +650,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb->protocol = htons(ETH_P_IPV6);
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = sockc->transmit_time;
skb_dst_set(skb, &rt->dst);
*dstp = NULL;
@@ -766,7 +767,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct dst_entry *dst = NULL;
struct raw6_frag_vec rfv;
struct flowi6 fl6;
- struct sockcm_cookie sockc;
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
u16 proto;
@@ -790,10 +790,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_mark = sk->sk_mark;
fl6.flowi6_uid = sk->sk_uid;
- ipc6.hlimit = -1;
- ipc6.tclass = -1;
- ipc6.dontfrag = -1;
- ipc6.opt = NULL;
+ ipcm6_init(&ipc6);
+ ipc6.sockc.tsflags = sk->sk_tsflags;
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -847,14 +845,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (fl6.flowi6_oif == 0)
fl6.flowi6_oif = sk->sk_bound_dev_if;
- sockc.tsflags = sk->sk_tsflags;
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -921,13 +918,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
back_from_confirm:
if (inet->hdrincl)
- err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags);
+ err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
+ msg->msg_flags, &ipc6.sockc);
else {
ipc6.opt = opt;
lock_sock(sk);
err = ip6_append_data(sk, raw6_getfrag, &rfv,
len, 0, &ipc6, &fl6, (struct rt6_info *)dst,
- msg->msg_flags, &sockc);
+ msg->msg_flags);
if (err)
ip6_flush_pending_frames(sk);
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 0fdf2a55e746..8d0ba757a46c 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -17,6 +17,7 @@
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/slab.h>
+#include <linux/rhashtable.h>
#include <net/ipv6.h>
#include <net/protocol.h>
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 558fe8cc6d43..8546f94f30d4 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -22,6 +22,7 @@
#include <linux/icmpv6.h>
#include <linux/mroute6.h>
#include <linux/slab.h>
+#include <linux/rhashtable.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 278e49cd67d4..e72947c99454 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -15,8 +15,8 @@
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
-static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *tcp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e6645cae403e..f6b96956a8ed 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1141,13 +1141,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int err;
int is_udplite = IS_UDPLITE(sk);
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
- struct sockcm_cookie sockc;
- ipc6.hlimit = -1;
- ipc6.tclass = -1;
- ipc6.dontfrag = -1;
+ ipcm6_init(&ipc6);
ipc6.gso_size = up->gso_size;
- sockc.tsflags = sk->sk_tsflags;
+ ipc6.sockc.tsflags = sk->sk_tsflags;
/* destination address check */
if (sin6) {
@@ -1282,7 +1279,7 @@ do_udp_sendmsg:
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
if (err > 0)
err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
- &ipc6, &sockc);
+ &ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -1376,7 +1373,7 @@ back_from_confirm:
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, &cork, &sockc);
+ msg->msg_flags, &cork);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
err = udp_v6_send_skb(skb, &fl6, &cork.base);
@@ -1402,7 +1399,7 @@ do_append_data:
up->len += ulen;
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
&ipc6, &fl6, (struct rt6_info *)dst,
- corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc);
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
else if (!corkreq)
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 03a2ff3fe1e6..95dee9ca8d22 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -114,8 +114,8 @@ out:
return segs;
}
-static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *udp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 40261cb68e83..1ea285bad84b 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -322,8 +322,7 @@ int l2tp_session_register(struct l2tp_session *session,
if (tunnel->version == L2TP_HDR_VER_3) {
pn = l2tp_pernet(tunnel->l2tp_net);
- g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net),
- session->session_id);
+ g_head = l2tp_session_id_hash_2(pn, session->session_id);
spin_lock_bh(&pn->l2tp_session_hlist_lock);
@@ -783,7 +782,7 @@ EXPORT_SYMBOL(l2tp_recv_common);
/* Drop skbs from the session's reorder_q
*/
-int l2tp_session_queue_purge(struct l2tp_session *session)
+static int l2tp_session_queue_purge(struct l2tp_session *session)
{
struct sk_buff *skb = NULL;
BUG_ON(!session);
@@ -794,7 +793,6 @@ int l2tp_session_queue_purge(struct l2tp_session *session)
}
return 0;
}
-EXPORT_SYMBOL_GPL(l2tp_session_queue_purge);
/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame
* here. The skb is not on a list when we get here.
@@ -1009,8 +1007,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
return bufp - optr;
}
-static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
- struct flowi *fl, size_t data_len)
+static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
+ struct flowi *fl, size_t data_len)
{
struct l2tp_tunnel *tunnel = session->tunnel;
unsigned int len = skb->len;
@@ -1052,8 +1050,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
atomic_long_inc(&tunnel->stats.tx_errors);
atomic_long_inc(&session->stats.tx_errors);
}
-
- return 0;
}
/* If caller requires the skb to have a ppp header, the header must be
@@ -1193,7 +1189,7 @@ end:
/* When the tunnel is closed, all the attached sessions need to go too.
*/
-void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
+static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
{
int hash;
struct hlist_node *walk;
@@ -1242,7 +1238,6 @@ again:
}
write_unlock_bh(&tunnel->hlist_lock);
}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
/* Tunnel socket destroy hook for UDP encapsulation */
static void l2tp_udp_encap_destroy(struct sock *sk)
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index c199020f8a8a..a5c09d3a5698 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -180,9 +180,6 @@ struct l2tp_tunnel {
struct net *l2tp_net; /* the net we belong to */
refcount_t ref_count;
-#ifdef CONFIG_DEBUG_FS
- void (*show)(struct seq_file *m, void *arg);
-#endif
int (*recv_payload_hook)(struct sk_buff *skb);
void (*old_sk_destruct)(struct sock *);
struct sock *sock; /* Parent socket */
@@ -190,8 +187,6 @@ struct l2tp_tunnel {
* was created by userspace */
struct work_struct del_work;
-
- uint8_t priv[0]; /* private data */
};
struct l2tp_nl_cmd_ops {
@@ -201,11 +196,6 @@ struct l2tp_nl_cmd_ops {
int (*session_delete)(struct l2tp_session *session);
};
-static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel)
-{
- return &tunnel->priv[0];
-}
-
static inline void *l2tp_session_priv(struct l2tp_session *session)
{
return &session->priv[0];
@@ -229,7 +219,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
struct l2tp_tunnel_cfg *cfg);
-void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
struct l2tp_session *l2tp_session_create(int priv_size,
struct l2tp_tunnel *tunnel,
@@ -244,7 +233,6 @@ void l2tp_session_free(struct l2tp_session *session);
void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
unsigned char *ptr, unsigned char *optr, u16 hdrflags,
int length, int (*payload_hook)(struct sk_buff *skb));
-int l2tp_session_queue_purge(struct l2tp_session *session);
int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
void l2tp_session_set_header_len(struct l2tp_session *session, int version);
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index e87686f7d63c..b5d7dde003ef 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -177,9 +177,6 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
atomic_long_read(&tunnel->stats.rx_packets),
atomic_long_read(&tunnel->stats.rx_bytes),
atomic_long_read(&tunnel->stats.rx_errors));
-
- if (tunnel->show != NULL)
- tunnel->show(m, tunnel);
}
static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 957369192ca1..672e5b753738 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -500,7 +500,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ip6_flowlabel *flowlabel = NULL;
struct dst_entry *dst = NULL;
struct flowi6 fl6;
- struct sockcm_cookie sockc_unused = {0};
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
int transhdrlen = 4; /* zero session-id */
@@ -525,9 +524,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_mark = sk->sk_mark;
fl6.flowi6_uid = sk->sk_uid;
- ipc6.hlimit = -1;
- ipc6.tclass = -1;
- ipc6.dontfrag = -1;
+ ipcm6_init(&ipc6);
if (lsa) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -575,8 +572,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt->tot_len = sizeof(struct ipv6_txoptions);
ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6,
- &sockc_unused);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -641,7 +637,7 @@ back_from_confirm:
err = ip6_append_data(sk, ip_generic_getfrag, msg,
ulen, transhdrlen, &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, &sockc_unused);
+ msg->msg_flags);
if (err)
ip6_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE))
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index e398797878a9..9ac02c93df98 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -424,12 +424,6 @@ static void pppol2tp_put_sk(struct rcu_head *head)
sock_put(ps->__sk);
}
-/* Called by l2tp_core when a session socket is being closed.
- */
-static void pppol2tp_session_close(struct l2tp_session *session)
-{
-}
-
/* Really kill the session socket. (Called from sock_put() if
* refcnt == 0.)
*/
@@ -573,7 +567,6 @@ static void pppol2tp_session_init(struct l2tp_session *session)
struct dst_entry *dst;
session->recv_skb = pppol2tp_recv;
- session->session_close = pppol2tp_session_close;
#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
session->show = pppol2tp_show;
#endif
@@ -595,40 +588,113 @@ static void pppol2tp_session_init(struct l2tp_session *session)
}
}
+struct l2tp_connect_info {
+ u8 version;
+ int fd;
+ u32 tunnel_id;
+ u32 peer_tunnel_id;
+ u32 session_id;
+ u32 peer_session_id;
+};
+
+static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len,
+ struct l2tp_connect_info *info)
+{
+ switch (sa_len) {
+ case sizeof(struct sockaddr_pppol2tp):
+ {
+ const struct sockaddr_pppol2tp *sa_v2in4 = sa;
+
+ if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
+
+ info->version = 2;
+ info->fd = sa_v2in4->pppol2tp.fd;
+ info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel;
+ info->session_id = sa_v2in4->pppol2tp.s_session;
+ info->peer_session_id = sa_v2in4->pppol2tp.d_session;
+
+ break;
+ }
+ case sizeof(struct sockaddr_pppol2tpv3):
+ {
+ const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa;
+
+ if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
+
+ info->version = 3;
+ info->fd = sa_v3in4->pppol2tp.fd;
+ info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel;
+ info->session_id = sa_v3in4->pppol2tp.s_session;
+ info->peer_session_id = sa_v3in4->pppol2tp.d_session;
+
+ break;
+ }
+ case sizeof(struct sockaddr_pppol2tpin6):
+ {
+ const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa;
+
+ if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
+
+ info->version = 2;
+ info->fd = sa_v2in6->pppol2tp.fd;
+ info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel;
+ info->session_id = sa_v2in6->pppol2tp.s_session;
+ info->peer_session_id = sa_v2in6->pppol2tp.d_session;
+
+ break;
+ }
+ case sizeof(struct sockaddr_pppol2tpv3in6):
+ {
+ const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa;
+
+ if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
+
+ info->version = 3;
+ info->fd = sa_v3in6->pppol2tp.fd;
+ info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel;
+ info->session_id = sa_v3in6->pppol2tp.s_session;
+ info->peer_session_id = sa_v3in6->pppol2tp.d_session;
+
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
*/
static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
int sockaddr_len, int flags)
{
struct sock *sk = sock->sk;
- struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr;
struct pppox_sock *po = pppox_sk(sk);
struct l2tp_session *session = NULL;
+ struct l2tp_connect_info info;
struct l2tp_tunnel *tunnel;
struct pppol2tp_session *ps;
struct l2tp_session_cfg cfg = { 0, };
- int error = 0;
- u32 tunnel_id, peer_tunnel_id;
- u32 session_id, peer_session_id;
bool drop_refcnt = false;
bool drop_tunnel = false;
bool new_session = false;
bool new_tunnel = false;
- int ver = 2;
- int fd;
-
- lock_sock(sk);
-
- error = -EINVAL;
+ int error;
- if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) &&
- sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) &&
- sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) &&
- sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6))
- goto end;
+ error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info);
+ if (error < 0)
+ return error;
- if (sp->sa_protocol != PX_PROTO_OL2TP)
- goto end;
+ lock_sock(sk);
/* Check for already bound sockets */
error = -EBUSY;
@@ -640,56 +706,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
if (sk->sk_user_data)
goto end; /* socket is already attached */
- /* Get params from socket address. Handle L2TPv2 and L2TPv3.
- * This is nasty because there are different sockaddr_pppol2tp
- * structs for L2TPv2, L2TPv3, over IPv4 and IPv6. We use
- * the sockaddr size to determine which structure the caller
- * is using.
- */
- peer_tunnel_id = 0;
- if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) {
- fd = sp->pppol2tp.fd;
- tunnel_id = sp->pppol2tp.s_tunnel;
- peer_tunnel_id = sp->pppol2tp.d_tunnel;
- session_id = sp->pppol2tp.s_session;
- peer_session_id = sp->pppol2tp.d_session;
- } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) {
- struct sockaddr_pppol2tpv3 *sp3 =
- (struct sockaddr_pppol2tpv3 *) sp;
- ver = 3;
- fd = sp3->pppol2tp.fd;
- tunnel_id = sp3->pppol2tp.s_tunnel;
- peer_tunnel_id = sp3->pppol2tp.d_tunnel;
- session_id = sp3->pppol2tp.s_session;
- peer_session_id = sp3->pppol2tp.d_session;
- } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpin6)) {
- struct sockaddr_pppol2tpin6 *sp6 =
- (struct sockaddr_pppol2tpin6 *) sp;
- fd = sp6->pppol2tp.fd;
- tunnel_id = sp6->pppol2tp.s_tunnel;
- peer_tunnel_id = sp6->pppol2tp.d_tunnel;
- session_id = sp6->pppol2tp.s_session;
- peer_session_id = sp6->pppol2tp.d_session;
- } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3in6)) {
- struct sockaddr_pppol2tpv3in6 *sp6 =
- (struct sockaddr_pppol2tpv3in6 *) sp;
- ver = 3;
- fd = sp6->pppol2tp.fd;
- tunnel_id = sp6->pppol2tp.s_tunnel;
- peer_tunnel_id = sp6->pppol2tp.d_tunnel;
- session_id = sp6->pppol2tp.s_session;
- peer_session_id = sp6->pppol2tp.d_session;
- } else {
- error = -EINVAL;
- goto end; /* bad socket address */
- }
-
/* Don't bind if tunnel_id is 0 */
error = -EINVAL;
- if (tunnel_id == 0)
+ if (!info.tunnel_id)
goto end;
- tunnel = l2tp_tunnel_get(sock_net(sk), tunnel_id);
+ tunnel = l2tp_tunnel_get(sock_net(sk), info.tunnel_id);
if (tunnel)
drop_tunnel = true;
@@ -697,7 +719,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
* peer_session_id is 0. Otherwise look up tunnel using supplied
* tunnel id.
*/
- if ((session_id == 0) && (peer_session_id == 0)) {
+ if (!info.session_id && !info.peer_session_id) {
if (tunnel == NULL) {
struct l2tp_tunnel_cfg tcfg = {
.encap = L2TP_ENCAPTYPE_UDP,
@@ -707,12 +729,16 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
/* Prevent l2tp_tunnel_register() from trying to set up
* a kernel socket.
*/
- if (fd < 0) {
+ if (info.fd < 0) {
error = -EBADF;
goto end;
}
- error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel);
+ error = l2tp_tunnel_create(sock_net(sk), info.fd,
+ info.version,
+ info.tunnel_id,
+ info.peer_tunnel_id, &tcfg,
+ &tunnel);
if (error < 0)
goto end;
@@ -741,9 +767,9 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
tunnel->recv_payload_hook = pppol2tp_recv_payload_hook;
if (tunnel->peer_tunnel_id == 0)
- tunnel->peer_tunnel_id = peer_tunnel_id;
+ tunnel->peer_tunnel_id = info.peer_tunnel_id;
- session = l2tp_session_get(sock_net(sk), tunnel, session_id);
+ session = l2tp_session_get(sock_net(sk), tunnel, info.session_id);
if (session) {
drop_refcnt = true;
@@ -772,8 +798,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
cfg.pw_type = L2TP_PWTYPE_PPP;
session = l2tp_session_create(sizeof(struct pppol2tp_session),
- tunnel, session_id,
- peer_session_id, &cfg);
+ tunnel, info.session_id,
+ info.peer_session_id, &cfg);
if (IS_ERR(session)) {
error = PTR_ERR(session);
goto end;
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index e3589ade62e0..bb707789ef2b 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -12,6 +12,7 @@ mac80211-y := \
scan.o offchannel.o \
ht.o agg-tx.o agg-rx.o \
vht.o \
+ he.o \
ibss.o \
iface.o \
rate.o \
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index e83c19d4c292..6a4f154c99f6 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -245,6 +245,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
};
int i, ret = -EOPNOTSUPP;
u16 status = WLAN_STATUS_REQUEST_DECLINED;
+ u16 max_buf_size;
if (tid >= IEEE80211_FIRST_TSPEC_TSID) {
ht_dbg(sta->sdata,
@@ -268,13 +269,18 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
goto end;
}
+ if (sta->sta.he_cap.has_he)
+ max_buf_size = IEEE80211_MAX_AMPDU_BUF;
+ else
+ max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
+
/* sanity check for incoming parameters:
* check if configuration can support the BA policy
* and if buffer size does not exceeds max value */
/* XXX: check own ht delayed BA capability?? */
if (((ba_policy != 1) &&
(!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) ||
- (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
+ (buf_size > max_buf_size)) {
status = WLAN_STATUS_INVALID_QOS_PARAM;
ht_dbg_ratelimited(sta->sdata,
"AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n",
@@ -283,7 +289,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
}
/* determine default buffer size */
if (buf_size == 0)
- buf_size = IEEE80211_MAX_AMPDU_BUF;
+ buf_size = max_buf_size;
/* make sure the size doesn't exceed the maximum supported by the hw */
if (buf_size > sta->sta.max_rx_aggregation_subframes)
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index ac4295296514..69e831bc317b 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -463,6 +463,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
.timeout = 0,
};
int ret;
+ u16 buf_size;
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
@@ -511,11 +512,22 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
sta->ampdu_mlme.addba_req_num[tid]++;
spin_unlock_bh(&sta->lock);
+ if (sta->sta.he_cap.has_he) {
+ buf_size = local->hw.max_tx_aggregation_subframes;
+ } else {
+ /*
+ * We really should use what the driver told us it will
+ * transmit as the maximum, but certain APs (e.g. the
+ * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012)
+ * will crash when we use a lower number.
+ */
+ buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
+ }
+
/* send AddBA request */
ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
tid_tx->dialog_token, params.ssn,
- IEEE80211_MAX_AMPDU_BUF,
- tid_tx->timeout);
+ buf_size, tid_tx->timeout);
}
/*
@@ -905,8 +917,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
{
struct tid_ampdu_tx *tid_tx;
struct ieee80211_txq *txq;
- u16 capab, tid;
- u8 buf_size;
+ u16 capab, tid, buf_size;
bool amsdu;
capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index bdf6fa78d0d2..02f3672e7b5e 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1412,6 +1412,11 @@ static int sta_apply_parameters(struct ieee80211_local *local,
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
params->vht_capa, sta);
+ if (params->he_capa)
+ ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
+ (void *)params->he_capa,
+ params->he_capa_len, sta);
+
if (params->opmode_notif_used) {
/* returned value is only needed for rc update, but the
* rc isn't initialized here yet, so ignore it
@@ -3486,7 +3491,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
}
local_bh_disable();
- ieee80211_xmit(sdata, sta, skb);
+ ieee80211_xmit(sdata, sta, skb, 0);
local_bh_enable();
ret = 0;
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 690c142a7a44..5ac743816b59 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -116,16 +116,16 @@ static void ieee80211_get_stats(struct net_device *dev,
data[i++] = sta->sta_state;
- if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))
data[i] = 100000ULL *
cfg80211_calculate_bitrate(&sinfo.txrate);
i++;
- if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))
data[i] = 100000ULL *
cfg80211_calculate_bitrate(&sinfo.rxrate);
i++;
- if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))
data[i] = (u8)sinfo.signal_avg;
i++;
} else {
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
new file mode 100644
index 000000000000..769078ed5a12
--- /dev/null
+++ b/net/mac80211/he.c
@@ -0,0 +1,55 @@
+/*
+ * HE handling
+ *
+ * Copyright(c) 2017 Intel Deutschland GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "ieee80211_i.h"
+
+void
+ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const u8 *he_cap_ie, u8 he_cap_len,
+ struct sta_info *sta)
+{
+ struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap;
+ struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie;
+ u8 he_ppe_size;
+ u8 mcs_nss_size;
+ u8 he_total_size;
+
+ memset(he_cap, 0, sizeof(*he_cap));
+
+ if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband))
+ return;
+
+ /* Make sure size is OK */
+ mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem);
+ he_ppe_size =
+ ieee80211_he_ppe_size(he_cap_ie[sizeof(he_cap->he_cap_elem) +
+ mcs_nss_size],
+ he_cap_ie_elem->phy_cap_info);
+ he_total_size = sizeof(he_cap->he_cap_elem) + mcs_nss_size +
+ he_ppe_size;
+ if (he_cap_len < he_total_size)
+ return;
+
+ memcpy(&he_cap->he_cap_elem, he_cap_ie, sizeof(he_cap->he_cap_elem));
+
+ /* HE Tx/Rx HE MCS NSS Support Field */
+ memcpy(&he_cap->he_mcs_nss_supp,
+ &he_cap_ie[sizeof(he_cap->he_cap_elem)], mcs_nss_size);
+
+ /* Check if there are (optional) PPE Thresholds */
+ if (he_cap->he_cap_elem.phy_cap_info[6] &
+ IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT)
+ memcpy(he_cap->ppe_thres,
+ &he_cap_ie[sizeof(he_cap->he_cap_elem) + mcs_nss_size],
+ he_ppe_size);
+
+ he_cap->has_he = true;
+}
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 26a7ba3b698f..f849ea814993 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
test_and_clear_bit(tid,
sta->ampdu_mlme.tid_rx_manage_offl))
___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
- IEEE80211_MAX_AMPDU_BUF,
+ IEEE80211_MAX_AMPDU_BUF_HT,
false, true);
if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d1978aa1c15d..172aeae21ae9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -165,6 +165,7 @@ typedef unsigned __bitwise ieee80211_tx_result;
#define TX_DROP ((__force ieee80211_tx_result) 1u)
#define TX_QUEUED ((__force ieee80211_tx_result) 2u)
+#define IEEE80211_TX_NO_SEQNO BIT(0)
#define IEEE80211_TX_UNICAST BIT(1)
#define IEEE80211_TX_PS_BUFFERED BIT(2)
@@ -364,6 +365,7 @@ enum ieee80211_sta_flags {
IEEE80211_STA_DISABLE_160MHZ = BIT(13),
IEEE80211_STA_DISABLE_WMM = BIT(14),
IEEE80211_STA_ENABLE_RRM = BIT(15),
+ IEEE80211_STA_DISABLE_HE = BIT(16),
};
struct ieee80211_mgd_auth_data {
@@ -1453,6 +1455,10 @@ struct ieee802_11_elems {
const struct ieee80211_vht_cap *vht_cap_elem;
const struct ieee80211_vht_operation *vht_operation;
const struct ieee80211_meshconf_ie *mesh_config;
+ const u8 *he_cap;
+ const struct ieee80211_he_operation *he_operation;
+ const struct ieee80211_mu_edca_param_set *mu_edca_param_set;
+ const u8 *uora_element;
const u8 *mesh_id;
const u8 *peering;
const __le16 *awake_window;
@@ -1482,6 +1488,7 @@ struct ieee802_11_elems {
u8 ext_supp_rates_len;
u8 wmm_info_len;
u8 wmm_param_len;
+ u8 he_cap_len;
u8 mesh_id_len;
u8 peering_len;
u8 preq_len;
@@ -1824,6 +1831,13 @@ void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
enum nl80211_chan_width
ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta);
+/* HE */
+void
+ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const u8 *he_cap_ie, u8 he_cap_len,
+ struct sta_info *sta);
+
/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
@@ -1880,19 +1894,20 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
bool bss_notify, bool enable_qos);
void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta, struct sk_buff *skb);
+ struct sta_info *sta, struct sk_buff *skb,
+ u32 txdata_flags);
void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum nl80211_band band);
+ enum nl80211_band band, u32 txdata_flags);
static inline void
ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum nl80211_band band)
+ enum nl80211_band band, u32 txdata_flags)
{
rcu_read_lock();
- __ieee80211_tx_skb_tid_band(sdata, skb, tid, band);
+ __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags);
rcu_read_unlock();
}
@@ -1910,7 +1925,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
}
__ieee80211_tx_skb_tid_band(sdata, skb, tid,
- chanctx_conf->def.chan->band);
+ chanctx_conf->def.chan->band, 0);
rcu_read_unlock();
}
@@ -2031,26 +2046,27 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
const u8 *bssid, u16 stype, u16 reason,
bool send_frame, u8 *frame_buf);
+
+enum {
+ IEEE80211_PROBE_FLAG_DIRECTED = BIT(0),
+ IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1),
+ IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2),
+};
+
int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
size_t buffer_len,
struct ieee80211_scan_ies *ie_desc,
const u8 *ie, size_t ie_len,
u8 bands_used, u32 *rate_masks,
- struct cfg80211_chan_def *chandef);
+ struct cfg80211_chan_def *chandef,
+ u32 flags);
struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
const u8 *src, const u8 *dst,
u32 ratemask,
struct ieee80211_channel *chan,
const u8 *ssid, size_t ssid_len,
const u8 *ie, size_t ie_len,
- bool directed);
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
- const u8 *src, const u8 *dst,
- const u8 *ssid, size_t ssid_len,
- const u8 *ie, size_t ie_len,
- u32 ratemask, bool directed, u32 tx_flags,
- struct ieee80211_channel *channel, bool scan);
-
+ u32 flags);
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
enum nl80211_band band, u32 *basic_rates);
@@ -2073,6 +2089,9 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
u32 cap);
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
const struct cfg80211_chan_def *chandef);
+u8 *ieee80211_ie_build_he_cap(u8 *pos,
+ const struct ieee80211_sta_he_cap *he_cap,
+ u8 *end);
int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
const struct ieee80211_supported_band *sband,
const u8 *srates, int srates_len, u32 *rates);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 555e389b7dfa..5e6cf2cee965 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev)
static u16 ieee80211_netdev_select_queue(struct net_device *dev,
struct sk_buff *skb,
- void *accel_priv,
+ struct net_device *sb_dev,
select_queue_fallback_t fallback)
{
return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
@@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
static u16 ieee80211_monitor_select_queue(struct net_device *dev,
struct sk_buff *skb,
- void *accel_priv,
+ struct net_device *sb_dev,
select_queue_fallback_t fallback)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index fb73451ed85e..4fb2709cb527 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -3,6 +3,7 @@
* Copyright 2005-2006, Devicescape Software, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (C) 2017 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -557,10 +558,19 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
wiphy_ext_feature_set(wiphy,
NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211);
- if (!ops->hw_scan)
+ if (!ops->hw_scan) {
wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
NL80211_FEATURE_AP_SCAN;
-
+ /*
+ * if the driver behaves correctly using the probe request
+ * (template) from mac80211, then both of these should be
+ * supported even with hw scan - but let drivers opt in.
+ */
+ wiphy_ext_feature_set(wiphy,
+ NL80211_EXT_FEATURE_SCAN_RANDOM_SN);
+ wiphy_ext_feature_set(wiphy,
+ NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT);
+ }
if (!ops->set_key)
wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
@@ -588,8 +598,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
local->hw.queues = 1;
local->hw.max_rates = 1;
local->hw.max_report_rates = 0;
- local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
- local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
+ local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
+ local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE;
local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
@@ -816,7 +826,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
int result, i;
enum nl80211_band band;
int channels, max_bitrates;
- bool supp_ht, supp_vht;
+ bool supp_ht, supp_vht, supp_he;
netdev_features_t feature_whitelist;
struct cfg80211_chan_def dflt_chandef = {};
@@ -896,6 +906,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
max_bitrates = 0;
supp_ht = false;
supp_vht = false;
+ supp_he = false;
for (band = 0; band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
@@ -922,6 +933,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
supp_ht = supp_ht || sband->ht_cap.ht_supported;
supp_vht = supp_vht || sband->vht_cap.vht_supported;
+ if (!supp_he)
+ supp_he = !!ieee80211_get_he_sta_cap(sband);
+
if (!sband->ht_cap.ht_supported)
continue;
@@ -1011,6 +1025,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
local->scan_ies_len +=
2 + sizeof(struct ieee80211_vht_cap);
+ /* HE cap element is variable in size - set len to allow max size */
+ /*
+ * TODO: 1 is added at the end of the calculation to accommodate for
+ * the temporary placing of the HE capabilities IE under EXT.
+ * Remove it once it is placed in the final place.
+ */
+ if (supp_he)
+ local->scan_ies_len +=
+ 2 + sizeof(struct ieee80211_he_cap_elem) +
+ sizeof(struct ieee80211_he_mcs_nss_supp) +
+ IEEE80211_HE_PPE_THRES_MAX_LEN + 1;
+
if (!local->ops->hw_scan) {
/* For hw_scan, driver needs to set these up. */
local->hw.wiphy->max_scan_ssids = 4;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a59187c016e0..7fb9957359a3 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel,
const struct ieee80211_ht_operation *ht_oper,
const struct ieee80211_vht_operation *vht_oper,
+ const struct ieee80211_he_operation *he_oper,
struct cfg80211_chan_def *chandef, bool tracking)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -207,7 +208,27 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
}
vht_chandef = *chandef;
- if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && he_oper &&
+ (le32_to_cpu(he_oper->he_oper_params) &
+ IEEE80211_HE_OPERATION_VHT_OPER_INFO)) {
+ struct ieee80211_vht_operation he_oper_vht_cap;
+
+ /*
+ * Set only first 3 bytes (other 2 aren't used in
+ * ieee80211_chandef_vht_oper() anyway)
+ */
+ memcpy(&he_oper_vht_cap, he_oper->optional, 3);
+ he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0);
+
+ if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap,
+ &vht_chandef)) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
+ sdata_info(sdata,
+ "HE AP VHT information is invalid, disable HE\n");
+ ret = IEEE80211_STA_DISABLE_HE;
+ goto out;
+ }
+ } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) {
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
sdata_info(sdata,
"AP VHT information is invalid, disable VHT\n");
@@ -300,12 +321,14 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_ht_cap *ht_cap,
const struct ieee80211_ht_operation *ht_oper,
const struct ieee80211_vht_operation *vht_oper,
+ const struct ieee80211_he_operation *he_oper,
const u8 *bssid, u32 *changed)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- struct ieee80211_supported_band *sband;
- struct ieee80211_channel *chan;
+ struct ieee80211_channel *chan = sdata->vif.bss_conf.chandef.chan;
+ struct ieee80211_supported_band *sband =
+ local->hw.wiphy->bands[chan->band];
struct cfg80211_chan_def chandef;
u16 ht_opmode;
u32 flags;
@@ -320,6 +343,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT)
vht_oper = NULL;
+ /* don't check HE if we associated as non-HE station */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HE ||
+ !ieee80211_get_he_sta_cap(sband))
+ he_oper = NULL;
+
if (WARN_ON_ONCE(!sta))
return -EINVAL;
@@ -333,12 +361,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.ht_operation_mode = ht_opmode;
}
- chan = sdata->vif.bss_conf.chandef.chan;
- sband = local->hw.wiphy->bands[chan->band];
-
- /* calculate new channel (type) based on HT/VHT operation IEs */
+ /* calculate new channel (type) based on HT/VHT/HE operation IEs */
flags = ieee80211_determine_chantype(sdata, sband, chan,
- ht_oper, vht_oper,
+ ht_oper, vht_oper, he_oper,
&chandef, true);
/*
@@ -582,6 +607,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
ieee80211_ie_build_vht_cap(pos, &vht_cap, cap);
}
+/* Append the HE capabilities element to the association request in @skb.
+ *
+ * The HE capabilities are looked up for @sband; if the band has none,
+ * nothing is added to @skb. @sdata is not read by this function.
+ */
+static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_supported_band *sband)
+{
+ u8 *pos;
+ const struct ieee80211_sta_he_cap *he_cap = NULL;
+ u8 he_cap_size;
+
+ he_cap = ieee80211_get_he_sta_cap(sband);
+ if (!he_cap)
+ return;
+
+ /*
+ * Element size: 2 bytes plus 1 extra byte (see TODO), the fixed HE
+ * capability fields, and the variable-length MCS/NSS and PPE
+ * threshold parts as sized by the helpers below.
+ *
+ * TODO: the 1 added is because this temporarily is under the EXTENSION
+ * IE. Get rid of it when it moves.
+ */
+ he_cap_size =
+ 2 + 1 + sizeof(he_cap->he_cap_elem) +
+ ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) +
+ ieee80211_he_ppe_size(he_cap->ppe_thres[0],
+ he_cap->he_cap_elem.phy_cap_info);
+ /* reserve exactly that much room and pass its end as the write limit */
+ pos = skb_put(skb, he_cap_size);
+ ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size);
+}
+
static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
@@ -643,6 +696,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
2 + 2 * sband->n_channels + /* supported channels */
2 + sizeof(struct ieee80211_ht_cap) + /* HT */
2 + sizeof(struct ieee80211_vht_cap) + /* VHT */
+ 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */
+ sizeof(struct ieee80211_he_mcs_nss_supp) +
+ IEEE80211_HE_PPE_THRES_MAX_LEN +
assoc_data->ie_len + /* extra IEs */
(assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) +
9, /* WMM */
@@ -827,11 +883,41 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
offset = noffset;
}
+ /* if present, add any custom IEs that go before HE */
+ if (assoc_data->ie_len) {
+ static const u8 before_he[] = {
+ /*
+ * no need to list the ones split off before VHT
+ * or generated here
+ */
+ WLAN_EID_OPMODE_NOTIF,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE,
+ /* 11ai elements */
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN,
+ /* TODO: add 11ah/11aj/11ak elements */
+ };
+
+ /* RIC already taken above, so no need to handle here anymore */
+ noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len,
+ before_he, ARRAY_SIZE(before_he),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, assoc_data->ie + offset, noffset - offset);
+ offset = noffset;
+ }
+
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
ieee80211_add_vht_ie(sdata, skb, sband,
&assoc_data->ap_vht_cap);
- /* if present, add any custom non-vendor IEs that go after HT */
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
+ ieee80211_add_he_ie(sdata, skb, sband);
+
+ /* if present, add any custom non-vendor IEs that go after HE */
if (assoc_data->ie_len) {
noffset = ieee80211_ie_split_vendor(assoc_data->ie,
assoc_data->ie_len,
@@ -898,6 +984,11 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local,
struct ieee80211_hdr_3addr *nullfunc;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ /* Don't send NDPs when STA is connected HE */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
+ return;
+
skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif,
!ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP));
if (!skb)
@@ -929,6 +1020,10 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local,
if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
return;
+ /* Don't send NDPs when connected HE */
+ if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE))
+ return;
+
skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30);
if (!skb)
return;
@@ -1700,9 +1795,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work)
}
/* MLME */
-static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- const u8 *wmm_param, size_t wmm_param_len)
+static bool
+ieee80211_sta_wmm_params(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *wmm_param, size_t wmm_param_len,
+ const struct ieee80211_mu_edca_param_set *mu_edca)
{
struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS];
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -1749,6 +1846,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_bk;
break;
case 2: /* AC_VI */
ac = IEEE80211_AC_VI;
@@ -1756,6 +1856,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_vi;
break;
case 3: /* AC_VO */
ac = IEEE80211_AC_VO;
@@ -1763,6 +1866,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_vo;
break;
case 0: /* AC_BE */
default:
@@ -1771,6 +1877,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_be;
break;
}
@@ -2219,6 +2328,20 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
ieee80211_sta_reset_conn_monitor(sdata);
}
+/*
+ * Build and transmit a directed probe request from the MLME.
+ *
+ * The frame is built with an all-ones rate mask ((u32)-1), no extra IEs
+ * and IEEE80211_PROBE_FLAG_DIRECTED, then handed to ieee80211_tx_skb().
+ * If the frame cannot be built (NULL skb) nothing is sent and no error
+ * is reported to the caller.
+ */
+static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ const u8 *ssid, size_t ssid_len,
+ struct ieee80211_channel *channel)
+{
+ struct sk_buff *skb;
+
+ skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel,
+ ssid, ssid_len, NULL, 0,
+ IEEE80211_PROBE_FLAG_DIRECTED);
+ if (skb)
+ ieee80211_tx_skb(sdata, skb);
+}
+
static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -2265,10 +2388,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
else
ssid_len = ssid[1];
- ieee80211_send_probe_req(sdata, sdata->vif.addr, dst,
- ssid + 2, ssid_len, NULL,
- 0, (u32) -1, true, 0,
- ifmgd->associated->channel, false);
+ ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst,
+ ssid + 2, ssid_len,
+ ifmgd->associated->channel);
rcu_read_unlock();
}
@@ -2370,7 +2492,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid,
(u32) -1, cbss->channel,
ssid + 2, ssid_len,
- NULL, 0, true);
+ NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED);
rcu_read_unlock();
return skb;
@@ -3008,6 +3130,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
goto out;
}
+ /*
+ * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark
+ * HE as disabled. If on the 5GHz band, make sure it supports VHT.
+ */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HT ||
+ (sband->band == NL80211_BAND_5GHZ &&
+ ifmgd->flags & IEEE80211_STA_DISABLE_VHT) ||
+ (!elems.he_cap && !elems.he_operation))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
+ (!elems.he_cap || !elems.he_operation)) {
+ mutex_unlock(&sdata->local->sta_mtx);
+ sdata_info(sdata,
+ "HE AP is missing HE capability/operation\n");
+ ret = false;
+ goto out;
+ }
+
/* Set up internal HT/VHT capabilities */
if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
@@ -3017,6 +3158,48 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
elems.vht_cap_elem, sta);
+ if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
+ elems.he_cap) {
+ ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
+ elems.he_cap,
+ elems.he_cap_len,
+ sta);
+
+ bss_conf->he_support = sta->sta.he_cap.has_he;
+ } else {
+ bss_conf->he_support = false;
+ }
+
+ if (bss_conf->he_support) {
+ u32 he_oper_params =
+ le32_to_cpu(elems.he_operation->he_oper_params);
+
+ bss_conf->bss_color = he_oper_params &
+ IEEE80211_HE_OPERATION_BSS_COLOR_MASK;
+ /*
+ * Multi-bit fields are extracted by masking and then shifting
+ * RIGHT by the field's bit offset so the value starts at bit 0.
+ * Shifting left (as a field *insert* would) moves the value
+ * further up and yields garbage once stored in bss_conf.
+ */
+ bss_conf->htc_trig_based_pkt_ext =
+ (he_oper_params &
+ IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) >>
+ IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET;
+ bss_conf->frame_time_rts_th =
+ (he_oper_params &
+ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) >>
+ IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET;
+
+ bss_conf->multi_sta_back_32bit =
+ sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
+ IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP;
+
+ bss_conf->ack_enabled =
+ sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
+ IEEE80211_HE_MAC_CAP2_ACK_EN;
+
+ /* UORA parameters are only recorded when the AP sent the element */
+ bss_conf->uora_exists = !!elems.uora_element;
+ if (elems.uora_element)
+ bss_conf->uora_ocw_range = elems.uora_element[0];
+
+ /* TODO: OPEN: what happens if BSS color disable is set? */
+ }
+
/*
* Some APs, e.g. Netgear WNDR3700, report invalid HT operation data
* in their association response, so ignore that data for our own
@@ -3076,7 +3259,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
ieee80211_set_wmm_default(sdata, false, false);
} else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
- elems.wmm_param_len)) {
+ elems.wmm_param_len,
+ elems.mu_edca_param_set)) {
/* still enable QoS since we might have HT/VHT */
ieee80211_set_wmm_default(sdata, false, true);
/* set the disable-WMM flag in this case to disable
@@ -3590,7 +3774,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) &&
ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
- elems.wmm_param_len))
+ elems.wmm_param_len,
+ elems.mu_edca_param_set))
changed |= BSS_CHANGED_QOS;
/*
@@ -3629,7 +3814,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (ieee80211_config_bw(sdata, sta,
elems.ht_cap_elem, elems.ht_operation,
- elems.vht_operation, bssid, &changed)) {
+ elems.vht_operation, elems.he_operation,
+ bssid, &changed)) {
mutex_unlock(&local->sta_mtx);
sdata_info(sdata,
"failed to follow AP %pM bandwidth change, disconnect\n",
@@ -4266,6 +4452,68 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata,
return chains;
}
+/*
+ * Check whether our own HE MCS/NSS capabilities on @sband satisfy the
+ * minimum MCS/NSS set that the AP advertises in its HE operation element.
+ *
+ * Returns true if, for at least one of the three RX/TX map pairs in our
+ * capabilities, every spatial stream the AP requires is supported for both
+ * RX and TX at no less than the AP's required level. Returns false if we
+ * have no HE capabilities on @sband, if @he_op is NULL, or if no map pair
+ * meets the requirements.
+ */
+static bool
+ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband,
+ const struct ieee80211_he_operation *he_op)
+{
+ const struct ieee80211_sta_he_cap *sta_he_cap =
+ ieee80211_get_he_sta_cap(sband);
+ const __le16 *sta_mcs_maps;
+ u16 ap_min_req_set;
+ int i;
+
+ if (!sta_he_cap || !he_op)
+ return false;
+
+ ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set);
+
+ /*
+ * View the MCS/NSS support struct as an array of 16-bit maps without
+ * dropping the const qualifier. NOTE(review): this relies on the
+ * struct being consecutive RX/TX __le16 pairs, one pair per
+ * bandwidth - confirm against struct ieee80211_he_mcs_nss_supp.
+ */
+ sta_mcs_maps = (const __le16 *)&sta_he_cap->he_mcs_nss_supp;
+
+ /* Need to go over for 80MHz, 160MHz and for 80+80 */
+ for (i = 0; i < 3; i++) {
+ u16 sta_mcs_map_rx = le16_to_cpu(sta_mcs_maps[2 * i]);
+ u16 sta_mcs_map_tx = le16_to_cpu(sta_mcs_maps[2 * i + 1]);
+ u8 nss;
+ bool verified = true;
+
+ /*
+ * For each bandwidth there is a maximum of 8 spatial streams
+ * possible. Each of the sta_mcs_map_* is a 16-bit struct built
+ * of 2 bits per NSS (1-8), with the values defined in enum
+ * ieee80211_he_mcs_support. Need to make sure STA TX and RX
+ * capabilities aren't less than the AP's minimum requirements
+ * for this HE BSS per SS.
+ * It is enough to find one such bandwidth that meets the reqs.
+ */
+ for (nss = 8; nss > 0; nss--) {
+ u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3;
+ u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3;
+ u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3;
+
+ /* the AP places no requirement on this stream count */
+ if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED)
+ continue;
+
+ /*
+ * Make sure the HE AP doesn't require MCSs that aren't
+ * supported by the client
+ */
+ if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
+ sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
+ (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) {
+ verified = false;
+ break;
+ }
+ }
+
+ if (verified)
+ return true;
+ }
+
+ /* If here, STA doesn't meet AP's HE min requirements */
+ return false;
+}
+
static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss)
{
@@ -4274,6 +4522,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_ht_cap *ht_cap = NULL;
const struct ieee80211_ht_operation *ht_oper = NULL;
const struct ieee80211_vht_operation *vht_oper = NULL;
+ const struct ieee80211_he_operation *he_oper = NULL;
struct ieee80211_supported_band *sband;
struct cfg80211_chan_def chandef;
int ret;
@@ -4329,6 +4578,24 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
}
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
+ ieee80211_get_he_sta_cap(sband)) {
+ const struct cfg80211_bss_ies *ies;
+ const u8 *he_oper_ie;
+
+ ies = rcu_dereference(cbss->ies);
+ he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION,
+ ies->data, ies->len);
+ if (he_oper_ie &&
+ he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3]))
+ he_oper = (void *)(he_oper_ie + 3);
+ else
+ he_oper = NULL;
+
+ if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+ }
+
/* Allow VHT if at least one channel on the sband supports 80 MHz */
have_80mhz = false;
for (i = 0; i < sband->n_channels; i++) {
@@ -4345,7 +4612,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
cbss->channel,
- ht_oper, vht_oper,
+ ht_oper, vht_oper, he_oper,
&chandef, false);
sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss),
@@ -4751,8 +5018,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) {
ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
netdev_info(sdata->dev,
- "disabling HT/VHT due to WEP/TKIP use\n");
+ "disabling HE/HT/VHT due to WEP/TKIP use\n");
}
}
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index f1d40b6645ff..8ef4153cd299 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -262,7 +262,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc,
if (roc->mgmt_tx_cookie) {
if (!WARN_ON(!roc->frame)) {
ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7,
- roc->chan->band);
+ roc->chan->band, 0);
roc->frame = NULL;
}
} else {
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0a38cc1cbebc..a16ba568e2a3 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -175,6 +175,20 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
len += 12;
}
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE) {
+ len = ALIGN(len, 2);
+ len += 12;
+ BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12);
+ }
+
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE_MU) {
+ len = ALIGN(len, 2);
+ len += 12;
+ BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12);
+ }
+
if (status->chains) {
/* antenna and antenna signal fields */
len += 2 * hweight8(status->chains);
@@ -263,6 +277,19 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
int mpdulen, chain;
unsigned long chains = status->chains;
struct ieee80211_vendor_radiotap rtap = {};
+ struct ieee80211_radiotap_he he = {};
+ struct ieee80211_radiotap_he_mu he_mu = {};
+
+ if (status->flag & RX_FLAG_RADIOTAP_HE) {
+ he = *(struct ieee80211_radiotap_he *)skb->data;
+ skb_pull(skb, sizeof(he));
+ WARN_ON_ONCE(status->encoding != RX_ENC_HE);
+ }
+
+ if (status->flag & RX_FLAG_RADIOTAP_HE_MU) {
+ he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data;
+ skb_pull(skb, sizeof(he_mu));
+ }
if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
@@ -520,6 +547,89 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
*pos++ = flags;
}
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE) {
+#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val))
+
+ if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) {
+ he.data6 |= HE_PREP(DATA6_NSTS,
+ FIELD_GET(RX_ENC_FLAG_STBC_MASK,
+ status->enc_flags));
+ he.data3 |= HE_PREP(DATA3_STBC, 1);
+ } else {
+ he.data6 |= HE_PREP(DATA6_NSTS, status->nss);
+ }
+
+#define CHECK_GI(s) \
+ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \
+ (int)NL80211_RATE_INFO_HE_GI_##s)
+
+ CHECK_GI(0_8);
+ CHECK_GI(1_6);
+ CHECK_GI(3_2);
+
+ he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx);
+ he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm);
+ he.data3 |= HE_PREP(DATA3_CODING,
+ !!(status->enc_flags & RX_ENC_FLAG_LDPC));
+
+ he.data5 |= HE_PREP(DATA5_GI, status->he_gi);
+
+ switch (status->bw) {
+ case RATE_INFO_BW_20:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ);
+ break;
+ case RATE_INFO_BW_40:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ);
+ break;
+ case RATE_INFO_BW_80:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ);
+ break;
+ case RATE_INFO_BW_160:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ);
+ break;
+ case RATE_INFO_BW_HE_RU:
+#define CHECK_RU_ALLOC(s) \
+ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \
+ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4)
+
+ CHECK_RU_ALLOC(26);
+ CHECK_RU_ALLOC(52);
+ CHECK_RU_ALLOC(106);
+ CHECK_RU_ALLOC(242);
+ CHECK_RU_ALLOC(484);
+ CHECK_RU_ALLOC(996);
+ CHECK_RU_ALLOC(2x996);
+
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ status->he_ru + 4);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid SU BW %d\n", status->bw);
+ }
+
+ /* ensure 2 byte alignment */
+ while ((pos - (u8 *)rthdr) & 1)
+ pos++;
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE);
+ memcpy(pos, &he, sizeof(he));
+ pos += sizeof(he);
+ }
+
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE_MU) {
+ /* ensure 2 byte alignment */
+ while ((pos - (u8 *)rthdr) & 1)
+ pos++;
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU);
+ memcpy(pos, &he_mu, sizeof(he_mu));
+ pos += sizeof(he_mu);
+ }
+
for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
*pos++ = status->chain_signal[chain];
*pos++ = chain;
@@ -613,6 +723,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
rcu_dereference(local->monitor_sdata);
bool only_monitor = false;
+ if (status->flag & RX_FLAG_RADIOTAP_HE)
+ rtap_space += sizeof(struct ieee80211_radiotap_he);
+
+ if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
+ rtap_space += sizeof(struct ieee80211_radiotap_he_mu);
+
if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
@@ -3241,7 +3357,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
}
__ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7,
- status->band);
+ status->band, 0);
}
dev_kfree_skb(rx->skb);
return RX_QUEUED;
@@ -3386,8 +3502,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
status = IEEE80211_SKB_RXCB((rx->skb));
sband = rx->local->hw.wiphy->bands[status->band];
- if (!(status->encoding == RX_ENC_HT) &&
- !(status->encoding == RX_ENC_VHT))
+ if (status->encoding == RX_ENC_LEGACY)
rate = &sband->bitrates[status->rate_idx];
ieee80211_rx_cooked_monitor(rx, rate);
@@ -4386,6 +4501,14 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
status->rate_idx, status->nss))
goto drop;
break;
+ case RX_ENC_HE:
+ if (WARN_ONCE(status->rate_idx > 11 ||
+ !status->nss ||
+ status->nss > 8,
+ "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n",
+ status->rate_idx, status->nss))
+ goto drop;
+ break;
default:
WARN_ON_ONCE(1);
/* fall through */
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 2e917a6d239d..5d2a11777718 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -20,6 +20,7 @@
#include <net/sch_generic.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/random.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
@@ -293,6 +294,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
struct cfg80211_chan_def chandef;
u8 bands_used = 0;
int i, ielen, n_chans;
+ u32 flags = 0;
req = rcu_dereference_protected(local->scan_req,
lockdep_is_held(&local->mtx));
@@ -331,12 +333,16 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
local->hw_scan_req->req.n_channels = n_chans;
ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
+ if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
+ flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
+
ielen = ieee80211_build_preq_ies(local,
(u8 *)local->hw_scan_req->req.ie,
local->hw_scan_ies_bufsize,
&local->hw_scan_req->ies,
req->ie, req->ie_len,
- bands_used, req->rates, &chandef);
+ bands_used, req->rates, &chandef,
+ flags);
local->hw_scan_req->req.ie_len = ielen;
local->hw_scan_req->req.no_cck = req->no_cck;
ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr);
@@ -528,6 +534,35 @@ void ieee80211_run_deferred_scan(struct ieee80211_local *local)
round_jiffies_relative(0));
}
+/*
+ * Build and transmit one probe request as part of a software scan.
+ *
+ * @flags carries IEEE80211_PROBE_FLAG_* bits and is passed on to
+ * ieee80211_build_probe_req(); @tx_flags is OR-ed into the frame's TX
+ * control flags. If the frame cannot be built (NULL skb) nothing is sent.
+ */
+static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ const u8 *ssid, size_t ssid_len,
+ const u8 *ie, size_t ie_len,
+ u32 ratemask, u32 flags, u32 tx_flags,
+ struct ieee80211_channel *channel)
+{
+ struct sk_buff *skb;
+ u32 txdata_flags = 0;
+
+ skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel,
+ ssid, ssid_len,
+ ie, ie_len, flags);
+
+ if (skb) {
+ if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) {
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ /* only the low 16 bits of the random value are kept */
+ u16 sn = get_random_u32();
+
+ /*
+ * Write the random sequence number into the header here
+ * and flag the frame with IEEE80211_TX_NO_SEQNO -
+ * presumably so the TX path leaves seq_ctrl alone
+ * (confirm in the TX code).
+ */
+ txdata_flags |= IEEE80211_TX_NO_SEQNO;
+ hdr->seq_ctrl =
+ cpu_to_le16(IEEE80211_SN_TO_SEQ(sn));
+ }
+ IEEE80211_SKB_CB(skb)->flags |= tx_flags;
+ /* sent with TID 7 on the band of the channel being scanned */
+ ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band,
+ txdata_flags);
+ }
+}
+}
+
static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
unsigned long *next_delay)
{
@@ -535,7 +570,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata;
struct cfg80211_scan_request *scan_req;
enum nl80211_band band = local->hw.conf.chandef.chan->band;
- u32 tx_flags;
+ u32 flags = 0, tx_flags;
scan_req = rcu_dereference_protected(local->scan_req,
lockdep_is_held(&local->mtx));
@@ -543,17 +578,21 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
if (scan_req->no_cck)
tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE;
+ if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
+ flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
+ if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN)
+ flags |= IEEE80211_PROBE_FLAG_RANDOM_SN;
sdata = rcu_dereference_protected(local->scan_sdata,
lockdep_is_held(&local->mtx));
for (i = 0; i < scan_req->n_ssids; i++)
- ieee80211_send_probe_req(
+ ieee80211_send_scan_probe_req(
sdata, local->scan_addr, scan_req->bssid,
scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len,
scan_req->ie, scan_req->ie_len,
- scan_req->rates[band], false,
- tx_flags, local->hw.conf.chandef.chan, true);
+ scan_req->rates[band], flags,
+ tx_flags, local->hw.conf.chandef.chan);
/*
* After sending probe requests, wait for probe responses
@@ -1141,6 +1180,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
u32 rate_masks[NUM_NL80211_BANDS] = {};
u8 bands_used = 0;
u8 *ie;
+ u32 flags = 0;
iebufsz = local->scan_ies_len + req->ie_len;
@@ -1157,6 +1197,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
}
}
+ if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
+ flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
+
ie = kcalloc(iebufsz, num_bands, GFP_KERNEL);
if (!ie) {
ret = -ENOMEM;
@@ -1167,7 +1210,8 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
&sched_scan_ies, req->ie,
- req->ie_len, bands_used, rate_masks, &chandef);
+ req->ie_len, bands_used, rate_masks, &chandef,
+ flags);
ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
if (ret == 0) {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 6428f1ac37b6..f34202242d24 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1323,6 +1323,11 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid,
struct ieee80211_tx_info *info;
struct ieee80211_chanctx_conf *chanctx_conf;
+ /* Don't send NDPs when STA is connected HE */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE))
+ return;
+
if (qos) {
fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
IEEE80211_STYPE_QOS_NULLFUNC |
@@ -1391,7 +1396,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid,
}
info->band = chanctx_conf->def.chan->band;
- ieee80211_xmit(sdata, sta, skb);
+ ieee80211_xmit(sdata, sta, skb, 0);
rcu_read_unlock();
}
@@ -1968,7 +1973,7 @@ sta_get_last_rx_stats(struct sta_info *sta)
return stats;
}
-static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
+static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate,
struct rate_info *rinfo)
{
rinfo->bw = STA_STATS_GET(BW, rate);
@@ -2005,6 +2010,14 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
break;
}
+ case STA_STATS_RATE_TYPE_HE:
+ rinfo->flags = RATE_INFO_FLAGS_HE_MCS;
+ rinfo->mcs = STA_STATS_GET(HE_MCS, rate);
+ rinfo->nss = STA_STATS_GET(HE_NSS, rate);
+ rinfo->he_gi = STA_STATS_GET(HE_GI, rate);
+ rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate);
+ rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate);
+ break;
}
}
@@ -2101,38 +2114,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
drv_sta_statistics(local, sdata, &sta->sta, sinfo);
- sinfo->filled |= BIT(NL80211_STA_INFO_INACTIVE_TIME) |
- BIT(NL80211_STA_INFO_STA_FLAGS) |
- BIT(NL80211_STA_INFO_BSS_PARAM) |
- BIT(NL80211_STA_INFO_CONNECTED_TIME) |
- BIT(NL80211_STA_INFO_RX_DROP_MISC);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) |
+ BIT_ULL(NL80211_STA_INFO_STA_FLAGS) |
+ BIT_ULL(NL80211_STA_INFO_BSS_PARAM) |
+ BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) |
+ BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC);
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count;
- sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS);
}
sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
sinfo->inactive_time =
jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta));
- if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) |
- BIT(NL80211_STA_INFO_TX_BYTES)))) {
+ if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) |
+ BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) {
sinfo->tx_bytes = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
sinfo->tx_bytes += sta->tx_stats.bytes[ac];
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) {
sinfo->tx_packets = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
sinfo->tx_packets += sta->tx_stats.packets[ac];
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS);
}
- if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) |
- BIT(NL80211_STA_INFO_RX_BYTES)))) {
+ if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) |
+ BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) {
sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats);
if (sta->pcpu_rx_stats) {
@@ -2144,10 +2157,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
}
}
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) {
sinfo->rx_packets = sta->rx_stats.packets;
if (sta->pcpu_rx_stats) {
for_each_possible_cpu(cpu) {
@@ -2157,17 +2170,17 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
sinfo->rx_packets += cpurxs->packets;
}
}
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) {
sinfo->tx_retries = sta->status_stats.retry_count;
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) {
sinfo->tx_failed = sta->status_stats.retry_failed;
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED);
}
sinfo->rx_dropped_misc = sta->rx_stats.dropped;
@@ -2182,23 +2195,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
!(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
- sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_RX) |
- BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) |
+ BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif);
}
if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) {
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) {
sinfo->signal = (s8)last_rxstats->last_signal;
- sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL);
}
if (!sta->pcpu_rx_stats &&
- !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) {
+ !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) {
sinfo->signal_avg =
-ewma_signal_read(&sta->rx_stats_avg.signal);
- sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG);
}
}
@@ -2207,11 +2220,11 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
* pcpu statistics
*/
if (last_rxstats->chains &&
- !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
- BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
- sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL);
+ !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) |
+ BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL);
if (!sta->pcpu_rx_stats)
- sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
sinfo->chains = last_rxstats->chains;
@@ -2223,15 +2236,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
}
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) {
sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate,
&sinfo->txrate);
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) {
if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE);
}
if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
@@ -2244,18 +2257,18 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
if (ieee80211_vif_is_mesh(&sdata->vif)) {
#ifdef CONFIG_MAC80211_MESH
- sinfo->filled |= BIT(NL80211_STA_INFO_LLID) |
- BIT(NL80211_STA_INFO_PLID) |
- BIT(NL80211_STA_INFO_PLINK_STATE) |
- BIT(NL80211_STA_INFO_LOCAL_PM) |
- BIT(NL80211_STA_INFO_PEER_PM) |
- BIT(NL80211_STA_INFO_NONPEER_PM);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) |
+ BIT_ULL(NL80211_STA_INFO_PLID) |
+ BIT_ULL(NL80211_STA_INFO_PLINK_STATE) |
+ BIT_ULL(NL80211_STA_INFO_LOCAL_PM) |
+ BIT_ULL(NL80211_STA_INFO_PEER_PM) |
+ BIT_ULL(NL80211_STA_INFO_NONPEER_PM);
sinfo->llid = sta->mesh->llid;
sinfo->plid = sta->mesh->plid;
sinfo->plink_state = sta->mesh->plink_state;
if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
- sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET);
sinfo->t_offset = sta->mesh->t_offset;
}
sinfo->local_pm = sta->mesh->local_pm;
@@ -2300,7 +2313,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
thr = sta_get_expected_throughput(sta);
if (thr != 0) {
- sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
sinfo->expected_throughput = thr;
}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 81b35f623792..9a04327d71d1 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -170,7 +170,7 @@ struct tid_ampdu_tx {
u8 dialog_token;
u8 stop_initiator;
bool tx_stop;
- u8 buf_size;
+ u16 buf_size;
u16 failed_bar_ssn;
bool bar_pending;
@@ -405,7 +405,7 @@ struct ieee80211_sta_rx_stats {
int last_signal;
u8 chains;
s8 chain_signal_last[IEEE80211_MAX_CHAINS];
- u16 last_rate;
+ u32 last_rate;
struct u64_stats_sync syncp;
u64 bytes;
u64 msdu[IEEE80211_NUM_TIDS + 1];
@@ -764,6 +764,7 @@ enum sta_stats_type {
STA_STATS_RATE_TYPE_LEGACY,
STA_STATS_RATE_TYPE_HT,
STA_STATS_RATE_TYPE_VHT,
+ STA_STATS_RATE_TYPE_HE,
};
#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0)
@@ -771,9 +772,14 @@ enum sta_stats_type {
#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4)
#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0)
#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4)
+#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0)
+#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4)
#define STA_STATS_FIELD_BW GENMASK(11, 8)
#define STA_STATS_FIELD_SGI GENMASK(12, 12)
#define STA_STATS_FIELD_TYPE GENMASK(15, 13)
+#define STA_STATS_FIELD_HE_RU GENMASK(18, 16)
+#define STA_STATS_FIELD_HE_GI GENMASK(20, 19)
+#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21)
#define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v)
#define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v)
@@ -782,7 +788,7 @@ enum sta_stats_type {
static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
{
- u16 r;
+ u32 r;
r = STA_STATS_FIELD(BW, s->bw);
@@ -804,6 +810,14 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
r |= STA_STATS_FIELD(LEGACY_BAND, s->band);
r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx);
break;
+ case RX_ENC_HE:
+ r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE);
+ r |= STA_STATS_FIELD(HE_NSS, s->nss);
+ r |= STA_STATS_FIELD(HE_MCS, s->rate_idx);
+ r |= STA_STATS_FIELD(HE_GI, s->he_gi);
+ r |= STA_STATS_FIELD(HE_RU, s->he_ru);
+ r |= STA_STATS_FIELD(HE_DCM, s->he_dcm);
+ break;
default:
WARN_ON(1);
return STA_STATS_RATE_INVALID;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 80a7edf8d314..0ab69a1964f8 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -92,7 +92,7 @@
STA_ENTRY \
__field(u16, tid) \
__field(u16, ssn) \
- __field(u8, buf_size) \
+ __field(u16, buf_size) \
__field(bool, amsdu) \
__field(u16, timeout) \
__field(u16, action)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index fa1f1e63a264..6a79d564de35 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -825,6 +825,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
*/
if (!ieee80211_is_data_qos(hdr->frame_control) ||
is_multicast_ether_addr(hdr->addr1)) {
+ if (tx->flags & IEEE80211_TX_NO_SEQNO)
+ return TX_CONTINUE;
/* driver should assign sequence number */
info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
/* for pure STA mode without beacons, we can do it */
@@ -1854,7 +1856,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb);
*/
static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, struct sk_buff *skb,
- bool txpending)
+ bool txpending, u32 txdata_flags)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_data tx;
@@ -1872,6 +1874,8 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
led_len = skb->len;
res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb);
+ tx.flags |= txdata_flags;
+
if (unlikely(res_prepare == TX_DROP)) {
ieee80211_free_txskb(&local->hw, skb);
return true;
@@ -1933,7 +1937,8 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
}
void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta, struct sk_buff *skb)
+ struct sta_info *sta, struct sk_buff *skb,
+ u32 txdata_flags)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
@@ -1968,7 +1973,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
}
ieee80211_set_qos_hdr(sdata, skb);
- ieee80211_tx(sdata, sta, skb, false);
+ ieee80211_tx(sdata, sta, skb, false, txdata_flags);
}
static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
@@ -2289,7 +2294,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
if (!ieee80211_parse_tx_radiotap(local, skb))
goto fail_rcu;
- ieee80211_xmit(sdata, NULL, skb);
+ ieee80211_xmit(sdata, NULL, skb, 0);
rcu_read_unlock();
return NETDEV_TX_OK;
@@ -3648,7 +3653,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
ieee80211_tx_stats(dev, skb->len);
- ieee80211_xmit(sdata, sta, skb);
+ ieee80211_xmit(sdata, sta, skb, 0);
}
goto out;
out_free:
@@ -3867,7 +3872,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
return true;
}
info->band = chanctx_conf->def.chan->band;
- result = ieee80211_tx(sdata, NULL, skb, true);
+ result = ieee80211_tx(sdata, NULL, skb, true, 0);
} else {
struct sk_buff_head skbs;
@@ -4783,7 +4788,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid);
void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum nl80211_band band)
+ enum nl80211_band band, u32 txdata_flags)
{
int ac = ieee80211_ac_from_tid(tid);
@@ -4800,7 +4805,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
*/
local_bh_disable();
IEEE80211_SKB_CB(skb)->band = band;
- ieee80211_xmit(sdata, NULL, skb);
+ ieee80211_xmit(sdata, NULL, skb, txdata_flags);
local_bh_enable();
}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 5e2e511c4a6f..3e68132a41fa 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1095,6 +1095,21 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
if (elen >= sizeof(*elems->max_idle_period_ie))
elems->max_idle_period_ie = (void *)pos;
break;
+ case WLAN_EID_EXTENSION:
+ if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA &&
+ elen >= (sizeof(*elems->mu_edca_param_set) + 1)) {
+ elems->mu_edca_param_set = (void *)&pos[1];
+ } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) {
+ elems->he_cap = (void *)&pos[1];
+ elems->he_cap_len = elen - 1;
+ } else if (pos[0] == WLAN_EID_EXT_HE_OPERATION &&
+ elen >= sizeof(*elems->he_operation) &&
+ elen >= ieee80211_he_oper_size(&pos[1])) {
+ elems->he_operation = (void *)&pos[1];
+ } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) {
+ elems->uora_element = (void *)&pos[1];
+ }
+ break;
default:
break;
}
@@ -1353,9 +1368,10 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
enum nl80211_band band,
u32 rate_mask,
struct cfg80211_chan_def *chandef,
- size_t *offset)
+ size_t *offset, u32 flags)
{
struct ieee80211_supported_band *sband;
+ const struct ieee80211_sta_he_cap *he_cap;
u8 *pos = buffer, *end = buffer + buffer_len;
size_t noffset;
int supp_rates_len, i;
@@ -1433,6 +1449,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
chandef->chan->center_freq);
}
+ if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT)
+ goto done;
+
/* insert custom IEs that go before HT */
if (ie && ie_len) {
static const u8 before_ht[] = {
@@ -1460,11 +1479,6 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
sband->ht_cap.cap);
}
- /*
- * If adding more here, adjust code in main.c
- * that calculates local->scan_ies_len.
- */
-
/* insert custom IEs that go before VHT */
if (ie && ie_len) {
static const u8 before_vht[] = {
@@ -1507,9 +1521,43 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
sband->vht_cap.cap);
}
+ /* insert custom IEs that go before HE */
+ if (ie && ie_len) {
+ static const u8 before_he[] = {
+ /*
+ * no need to list the ones split off before VHT
+ * or generated here
+ */
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS,
+ WLAN_EID_AP_CSN,
+ /* TODO: add 11ah/11aj/11ak elements */
+ };
+ noffset = ieee80211_ie_split(ie, ie_len,
+ before_he, ARRAY_SIZE(before_he),
+ *offset);
+ if (end - pos < noffset - *offset)
+ goto out_err;
+ memcpy(pos, ie + *offset, noffset - *offset);
+ pos += noffset - *offset;
+ *offset = noffset;
+ }
+
+ he_cap = ieee80211_get_he_sta_cap(sband);
+ if (he_cap) {
+ pos = ieee80211_ie_build_he_cap(pos, he_cap, end);
+ if (!pos)
+ goto out_err;
+ }
+
+ /*
+ * If adding more here, adjust code in main.c
+ * that calculates local->scan_ies_len.
+ */
+
return pos - buffer;
out_err:
WARN_ONCE(1, "not enough space for preq IEs\n");
+ done:
return pos - buffer;
}
@@ -1518,7 +1566,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
struct ieee80211_scan_ies *ie_desc,
const u8 *ie, size_t ie_len,
u8 bands_used, u32 *rate_masks,
- struct cfg80211_chan_def *chandef)
+ struct cfg80211_chan_def *chandef,
+ u32 flags)
{
size_t pos = 0, old_pos = 0, custom_ie_offset = 0;
int i;
@@ -1533,7 +1582,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
ie, ie_len, i,
rate_masks[i],
chandef,
- &custom_ie_offset);
+ &custom_ie_offset,
+ flags);
ie_desc->ies[i] = buffer + old_pos;
ie_desc->len[i] = pos - old_pos;
old_pos = pos;
@@ -1561,7 +1611,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *chan,
const u8 *ssid, size_t ssid_len,
const u8 *ie, size_t ie_len,
- bool directed)
+ u32 flags)
{
struct ieee80211_local *local = sdata->local;
struct cfg80211_chan_def chandef;
@@ -1577,7 +1627,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
* badly-behaved APs don't respond when this parameter is included.
*/
chandef.width = sdata->vif.bss_conf.chandef.width;
- if (directed)
+ if (flags & IEEE80211_PROBE_FLAG_DIRECTED)
chandef.chan = NULL;
else
chandef.chan = chan;
@@ -1591,7 +1641,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb),
skb_tailroom(skb), &dummy_ie_desc,
ie, ie_len, BIT(chan->band),
- rate_masks, &chandef);
+ rate_masks, &chandef, flags);
skb_put(skb, ies_len);
if (dst) {
@@ -1605,27 +1655,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
return skb;
}
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
- const u8 *src, const u8 *dst,
- const u8 *ssid, size_t ssid_len,
- const u8 *ie, size_t ie_len,
- u32 ratemask, bool directed, u32 tx_flags,
- struct ieee80211_channel *channel, bool scan)
-{
- struct sk_buff *skb;
-
- skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel,
- ssid, ssid_len,
- ie, ie_len, directed);
- if (skb) {
- IEEE80211_SKB_CB(skb)->flags |= tx_flags;
- if (scan)
- ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band);
- else
- ieee80211_tx_skb(sdata, skb);
- }
-}
-
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
enum nl80211_band band, u32 *basic_rates)
@@ -2412,6 +2441,72 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
return pos;
}
+u8 *ieee80211_ie_build_he_cap(u8 *pos,
+ const struct ieee80211_sta_he_cap *he_cap,
+ u8 *end)
+{
+ u8 n;
+ u8 ie_len;
+ u8 *orig_pos = pos;
+
+ /* Make sure we have place for the IE */
+ /*
+ * TODO: the 1 added is because this temporarily is under the EXTENSION
+ * IE. Get rid of it when it moves.
+ */
+ if (!he_cap)
+ return orig_pos;
+
+ n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem);
+ ie_len = 2 + 1 +
+ sizeof(he_cap->he_cap_elem) + n +
+ ieee80211_he_ppe_size(he_cap->ppe_thres[0],
+ he_cap->he_cap_elem.phy_cap_info);
+
+ if ((end - pos) < ie_len)
+ return orig_pos;
+
+ *pos++ = WLAN_EID_EXTENSION;
+ pos++; /* We'll set the size later below */
+ *pos++ = WLAN_EID_EXT_HE_CAPABILITY;
+
+ /* Fixed data */
+ memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem));
+ pos += sizeof(he_cap->he_cap_elem);
+
+ memcpy(pos, &he_cap->he_mcs_nss_supp, n);
+ pos += n;
+
+ /* Check if PPE Threshold should be present */
+ if ((he_cap->he_cap_elem.phy_cap_info[6] &
+ IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
+ goto end;
+
+ /*
+ * Calculate how many PPET16/PPET8 pairs are to come. Algorithm:
+ * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK)
+ */
+ n = hweight8(he_cap->ppe_thres[0] &
+ IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
+ n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >>
+ IEEE80211_PPE_THRES_NSS_POS));
+
+ /*
+ * Each pair is 6 bits, and we need to add the 7 "header" bits to the
+ * total size.
+ */
+ n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
+ n = DIV_ROUND_UP(n, 8);
+
+ /* Copy PPE Thresholds */
+ memcpy(pos, &he_cap->ppe_thres, n);
+ pos += n;
+
+end:
+ orig_pos[1] = (pos - orig_pos) - 2;
+ return pos;
+}
+
u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
const struct cfg80211_chan_def *chandef,
u16 prot_mode, bool rifs_mode)
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 168af54db975..dc240cb47ddf 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
}
EXPORT_SYMBOL(nf_conntrack_destroy);
+bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb)
+{
+ struct nf_ct_hook *ct_hook;
+ bool ret = false;
+
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ret = ct_hook->get_tuple_skb(dst_tuple, skb);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_ct_get_tuple_skb);
+
/* Built-in default zone used e.g. by modules. */
const struct nf_conntrack_zone nf_ct_zone_dflt = {
.id = NF_CT_DEFAULT_ZONE_ID,
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index a1086bdec242..5423b197d98a 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -32,7 +32,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
__be32 mask = 0;
/* we're only interested in locally generated packets */
- if (skb->sk == NULL)
+ if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk)))
goto out;
if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
goto out;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3d5280425027..805500197c22 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1683,6 +1683,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
return 0;
}
+static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb)
+{
+ const struct nf_conntrack_tuple *src_tuple;
+ const struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple srctuple;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
+ memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+ return true;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ NFPROTO_IPV4, dev_net(skb->dev),
+ &srctuple))
+ return false;
+
+ hash = nf_conntrack_find_get(dev_net(skb->dev),
+ &nf_ct_zone_dflt,
+ &srctuple);
+ if (!hash)
+ return false;
+
+ ct = nf_ct_tuplehash_to_ctrack(hash);
+ src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
+ memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+ nf_ct_put(ct);
+
+ return true;
+}
+
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -2204,6 +2239,7 @@ err_cachep:
static struct nf_ct_hook nf_conntrack_hook = {
.update = nf_conntrack_update,
.destroy = destroy_conntrack,
+ .get_tuple_skb = nf_conntrack_get_tuple_skb,
};
void nf_conntrack_init_end(void)
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index dc61399e30be..a8c5c846aec1 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -132,9 +132,10 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
-void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
+void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
+ struct sock *sk)
{
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
return;
read_lock_bh(&sk->sk_callback_lock);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 46f9df99d276..86df2a1666fd 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -108,6 +108,7 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
struct flowi fl;
unsigned int hh_len;
struct dst_entry *dst;
+ struct sock *sk = skb->sk;
int err;
err = xfrm_decode_session(skb, &fl, family);
@@ -119,7 +120,10 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
dst = ((struct xfrm_dst *)dst)->route;
dst_hold(dst);
- dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
+ if (sk && !net_eq(net, sock_net(sk)))
+ sk = NULL;
+
+ dst = xfrm_lookup(net, dst, &fl, sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 896d4a36081d..3f211e1025c1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/vmalloc.h>
+#include <linux/rhashtable.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 1105a23bda5e..2b94dcc43456 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -107,7 +107,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
break;
case NFT_META_SKUID:
sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) ||
+ !net_eq(nft_net(pkt), sock_net(sk)))
goto err;
read_lock_bh(&sk->sk_callback_lock);
@@ -123,7 +124,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
break;
case NFT_META_SKGID:
sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) ||
+ !net_eq(nft_net(pkt), sock_net(sk)))
goto err;
read_lock_bh(&sk->sk_callback_lock);
@@ -214,7 +216,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) ||
+ !net_eq(nft_net(pkt), sock_net(sk)))
goto err;
*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
break;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 74e1b3bd6954..998c2b546f6d 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -23,6 +23,9 @@ static void nft_socket_eval(const struct nft_expr *expr,
struct sock *sk = skb->sk;
u32 *dest = &regs->data[priv->dreg];
+ if (sk && !net_eq(nft_net(pkt), sock_net(sk)))
+ sk = NULL;
+
if (!sk)
switch(nft_pf(pkt)) {
case NFPROTO_IPV4:
@@ -39,7 +42,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
return;
}
- if(!sk) {
+ if (!sk) {
nft_reg_store8(dest, 0);
return;
}
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 7df2dece57d3..5d92e1781980 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -72,8 +72,9 @@ static bool
cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_cgroup_info_v0 *info = par->matchinfo;
+ struct sock *sk = skb->sk;
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
return false;
return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
@@ -85,8 +86,9 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_cgroup_info_v1 *info = par->matchinfo;
struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data;
struct cgroup *ancestor = info->priv;
+ struct sock *sk = skb->sk;
- if (!skb->sk || !sk_fullsock(skb->sk))
+ if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
return false;
if (ancestor)
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 3d705c688a27..46686fb73784 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -67,7 +67,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
struct sock *sk = skb_to_full_sk(skb);
struct net *net = xt_net(par);
- if (sk == NULL || sk->sk_socket == NULL)
+ if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk)))
return (info->match ^ info->invert) == 0;
else if (info->match & info->invert & XT_OWNER_SOCKET)
/*
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 07085c22b19c..f44de4bc2100 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -265,7 +265,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
/* use TTL as seen before forwarding */
- if (xt_out(par) != NULL && skb->sk == NULL)
+ if (xt_out(par) != NULL &&
+ (!skb->sk || !net_eq(net, sock_net(skb->sk))))
ttl++;
spin_lock_bh(&recent_lock);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 5c0779c4fa3c..0472f3472842 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
+ if (!net_eq(xt_net(par), sock_net(sk)))
+ sk = NULL;
+
if (!sk)
sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
+
if (sk) {
bool wildcard;
bool transparent = true;
@@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
+ if (!net_eq(xt_net(par), sock_net(sk)))
+ sk = NULL;
+
if (!sk)
sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
+
if (sk) {
bool wildcard;
bool transparent = true;
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 30a5df27116e..85ae53d8fd09 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
clone_flow_key);
}
+/* When 'last' is true, clone() should always consume the 'skb'.
+ * Otherwise, clone() should keep 'skb' intact regardless what
+ * actions are executed within clone().
+ */
+static int clone(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key, const struct nlattr *attr,
+ bool last)
+{
+ struct nlattr *actions;
+ struct nlattr *clone_arg;
+ int rem = nla_len(attr);
+ bool dont_clone_flow_key;
+
+ /* The first action is always 'OVS_CLONE_ATTR_ARG'. */
+ clone_arg = nla_data(attr);
+ dont_clone_flow_key = nla_get_u32(clone_arg);
+ actions = nla_next(clone_arg, &rem);
+
+ return clone_execute(dp, skb, key, 0, actions, rem, last,
+ !dont_clone_flow_key);
+}
+
static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
const struct nlattr *attr)
{
@@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
consume_skb(skb);
return 0;
}
+ break;
+
+ case OVS_ACTION_ATTR_CLONE: {
+ bool last = nla_is_last(a, rem);
+
+ err = clone(dp, skb, key, a, last);
+ if (last)
+ return err;
+
+ break;
+ }
}
if (unlikely(err)) {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 492ab0c36f7c..a70097ecf33c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
return 0;
}
+static int validate_and_copy_clone(struct net *net,
+ const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci,
+ bool log, bool last)
+{ /* Copy a clone action into *sfa as a nested OVS_ACTION_ATTR_CLONE; 0 or negative error. */
+ int start, err;
+ u32 exec;
+
+ if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN) /* non-empty but shorter than one attribute header */
+ return -EINVAL;
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log);
+ if (start < 0)
+ return start;
+
+ exec = last || !actions_may_change_flow(attr); /* boolean result stored as a u32 (0 or 1) */
+
+ err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec,
+ sizeof(exec), log); /* the flag goes first, ahead of the copied actions */
+ if (err)
+ return err;
+
+ err = __ovs_nla_copy_actions(net, attr, key, sfa,
+ eth_type, vlan_tci, log);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, start); /* reached only when every step above succeeded */
+
+ return 0;
+}
+
void ovs_match_init(struct sw_flow_match *match,
struct sw_flow_key *key,
bool reset_key,
@@ -2516,7 +2550,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
struct ovs_tunnel_info *ovs_tun;
struct nlattr *a;
int err = 0, start, opts_type;
+ __be16 dst_opt_type;
+ dst_opt_type = 0;
ovs_match_init(&match, &key, true, NULL);
opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
if (opts_type < 0)
@@ -2528,10 +2564,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
err = validate_geneve_opts(&key);
if (err < 0)
return err;
+ dst_opt_type = TUNNEL_GENEVE_OPT;
break;
case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+ dst_opt_type = TUNNEL_VXLAN_OPT;
break;
case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+ dst_opt_type = TUNNEL_ERSPAN_OPT;
break;
}
}
@@ -2574,7 +2613,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
*/
ip_tunnel_info_opts_set(tun_info,
TUN_METADATA_OPTS(&key, key.tun_opts_len),
- key.tun_opts_len);
+ key.tun_opts_len, dst_opt_type);
add_nested_action_end(*sfa, start);
return err;
@@ -2844,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
[OVS_ACTION_ATTR_POP_NSH] = 0,
[OVS_ACTION_ATTR_METER] = sizeof(u32),
+ [OVS_ACTION_ATTR_CLONE] = (u32)-1,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -3033,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
/* Non-existent meters are simply ignored. */
break;
+ case OVS_ACTION_ATTR_CLONE: {
+ bool last = nla_is_last(a, rem);
+
+ err = validate_and_copy_clone(net, a, key, sfa,
+ eth_type, vlan_tci,
+ log, last);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+ }
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
@@ -3111,6 +3163,26 @@ out:
return err;
}
+static int clone_action_to_attr(const struct nlattr *attr,
+ struct sk_buff *skb)
+{ /* Emit the payload of 'attr' into 'skb' inside an OVS_ACTION_ATTR_CLONE nest. */
+ struct nlattr *start;
+ int err = 0, rem = nla_len(attr);
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE);
+ if (!start)
+ return -EMSGSIZE; /* the nest could not be opened */
+
+ err = ovs_nla_put_actions(nla_data(attr), rem, skb);
+
+ if (err)
+ nla_nest_cancel(skb, start); /* undo the nest opened above */
+ else
+ nla_nest_end(skb, start);
+
+ return err; /* 0, or the error from ovs_nla_put_actions() */
+}
+
static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
{
const struct nlattr *ovs_key = nla_data(a);
@@ -3199,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
return err;
break;
+ case OVS_ACTION_ATTR_CLONE:
+ err = clone_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
default:
if (nla_put(skb, type, nla_len(a), nla_data(a)))
return -EMSGSIZE;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9b27d0cd766d..e3e00d3a972e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
return po->xmit == packet_direct_xmit;
}
-static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
{
- return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+ return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
}
static u16 packet_pick_tx_queue(struct sk_buff *skb)
@@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb)
__packet_pick_tx_queue);
queue_index = netdev_cap_txqueue(dev, queue_index);
} else {
- queue_index = __packet_pick_tx_queue(dev, skb);
+ queue_index = __packet_pick_tx_queue(dev, skb, NULL);
}
return queue_index;
@@ -1951,7 +1952,7 @@ retry:
goto out_unlock;
}
- sockc.tsflags = sk->sk_tsflags;
+ sockcm_init(&sockc, sk);
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
if (unlikely(err))
@@ -1962,6 +1963,7 @@ retry:
skb->dev = dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = sockc.transmit_time;
sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
@@ -2457,6 +2459,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb->dev = dev;
skb->priority = po->sk.sk_priority;
skb->mark = po->sk.sk_mark;
+ skb->tstamp = sockc->transmit_time;
sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
skb_shinfo(skb)->destructor_arg = ph.raw;
@@ -2633,7 +2636,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_put;
- sockc.tsflags = po->sk.sk_tsflags;
+ sockcm_init(&sockc, &po->sk);
if (msg->msg_controllen) {
err = sock_cmsg_send(&po->sk, msg, &sockc);
if (unlikely(err))
@@ -2829,7 +2832,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_unlock;
- sockc.tsflags = sk->sk_tsflags;
+ sockcm_init(&sockc, sk);
sockc.mark = sk->sk_mark;
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
@@ -2905,6 +2908,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
skb->dev = dev;
skb->priority = sk->sk_priority;
skb->mark = sockc.mark;
+ skb->tstamp = sockc.transmit_time;
if (has_vnet_hdr) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index b4e421aa9727..1eaf2550a9f8 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -376,8 +376,6 @@ static void release_refill(struct rds_connection *conn)
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
* sockets.
- *
- * -1 is returned if posting fails due to temporary resource exhaustion.
*/
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
{
@@ -1025,7 +1023,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
- int ret = 0;
rdsdebug("conn %p\n", conn);
if (rds_conn_up(conn)) {
@@ -1034,7 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
rds_ib_stats_inc(s_ib_rx_refill_from_thread);
}
- return ret;
+ return 0;
}
int rds_ib_recv_init(void)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..7af246764a35 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -183,6 +183,17 @@ config NET_SCH_CBS
To compile this code as a module, choose M here: the
module will be called sch_cbs.
+config NET_SCH_ETF
+ tristate "Earliest TxTime First (ETF)"
+ help
+ Say Y here if you want to use the Earliest TxTime First (ETF) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_etf.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_etf.
+
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
---help---
@@ -284,6 +295,17 @@ config NET_SCH_FQ_CODEL
If unsure, say N.
+config NET_SCH_CAKE
+ tristate "Common Applications Kept Enhanced (CAKE)"
+ help
+ Say Y here if you want to use the Common Applications Kept Enhanced
+ (CAKE) queue management algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_cake.
+
+ If unsure, say N.
+
config NET_SCH_FQ
tristate "Fair Queue"
help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..673ee7d26ff2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -50,10 +50,12 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
+obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3f4cf930f809..148a89ab789b 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
res->goto_tp = rcu_dereference_bh(chain->filter_chain);
}
+static void tcf_free_cookie_rcu(struct rcu_head *p)
+{ /* call_rcu() callback: free a tc_cookie together with its data buffer. */
+ struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); /* 'p' is the cookie's embedded 'rcu' member */
+
+ kfree(cookie->data);
+ kfree(cookie);
+}
+
+static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
+ struct tc_cookie *new_cookie)
+{ /* Install new_cookie (may be NULL) and hand the previous cookie to call_rcu(). */
+ struct tc_cookie *old;
+
+ old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); /* swap and fetch the previous pointer */
+ if (old)
+ call_rcu(&old->rcu, tcf_free_cookie_rcu); /* freed later by tcf_free_cookie_rcu() */
+}
+
/* XXX: For standalone actions, we don't need a RCU grace period either, because
* actions are always connected to filters and filters are already destroyed in
* RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -65,44 +83,64 @@ static void free_tcf(struct tc_action *p)
free_percpu(p->cpu_bstats);
free_percpu(p->cpu_qstats);
- if (p->act_cookie) {
- kfree(p->act_cookie->data);
- kfree(p->act_cookie);
- }
+ tcf_set_action_cookie(&p->act_cookie, NULL);
if (p->goto_chain)
tcf_action_goto_chain_fini(p);
kfree(p);
}
-static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
+static void tcf_action_cleanup(struct tc_action *p)
{
- spin_lock(&idrinfo->lock);
- idr_remove(&idrinfo->action_idr, p->tcfa_index);
- spin_unlock(&idrinfo->lock);
+ if (p->ops->cleanup)
+ p->ops->cleanup(p);
+
gen_kill_estimator(&p->tcfa_rate_est);
free_tcf(p);
}
+static int __tcf_action_put(struct tc_action *p, bool bind)
+{ /* Drop one reference, plus one bind count if 'bind'; returns 1 if the action was released, else 0. */
+ struct tcf_idrinfo *idrinfo = p->idrinfo;
+
+ if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { /* true: idrinfo->lock is held and is released below */
+ if (bind)
+ atomic_dec(&p->tcfa_bindcnt);
+ idr_remove(&idrinfo->action_idr, p->tcfa_index);
+ spin_unlock(&idrinfo->lock);
+
+ tcf_action_cleanup(p); /* runs after the lock is dropped */
+ return 1;
+ }
+
+ if (bind)
+ atomic_dec(&p->tcfa_bindcnt); /* action stays alive; only the bind count changes here */
+
+ return 0;
+}
+
int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
{
int ret = 0;
- ASSERT_RTNL();
-
+ /* Release with strict==1 and bind==0 is only called through the act API
+ * interface (classifiers always bind). The only case when an action with
+ * positive reference count and zero bind count can exist is when it was
+ * also created with the act API (unbinding the last classifier will destroy
+ * the action if it was created by a classifier). So the only case when the
+ * bind count can be changed after the initial check is when an unbound action
+ * is destroyed by the act API while a classifier binds to an action with the
+ * same id concurrently. This results in either creation of a new action (same
+ * behavior as before), or reuse of the existing action if the concurrent
+ * process increments the reference count before the action is deleted. Both
+ * scenarios are acceptable.
+ */
if (p) {
- if (bind)
- p->tcfa_bindcnt--;
- else if (strict && p->tcfa_bindcnt > 0)
+ if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0)
return -EPERM;
- p->tcfa_refcnt--;
- if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
- if (p->ops->cleanup)
- p->ops->cleanup(p);
- tcf_idr_remove(p->idrinfo, p);
+ if (__tcf_action_put(p, bind))
ret = ACT_P_DELETED;
- }
}
return ret;
@@ -111,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release);
static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
{
+ struct tc_cookie *act_cookie;
u32 cookie_len = 0;
- if (act->act_cookie)
- cookie_len = nla_total_size(act->act_cookie->len);
+ rcu_read_lock();
+ act_cookie = rcu_dereference(act->act_cookie);
+
+ if (act_cookie)
+ cookie_len = nla_total_size(act_cookie->len);
+ rcu_read_unlock();
return nla_total_size(0) /* action number nested */
+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
@@ -257,46 +300,77 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
}
EXPORT_SYMBOL(tcf_generic_walker);
-static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
+static bool __tcf_idr_check(struct tc_action_net *tn, u32 index,
+ struct tc_action **a, int bind)
{
- struct tc_action *p = NULL;
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+ struct tc_action *p;
spin_lock(&idrinfo->lock);
p = idr_find(&idrinfo->action_idr, index);
+ if (IS_ERR(p)) {
+ p = NULL;
+ } else if (p) {
+ refcount_inc(&p->tcfa_refcnt);
+ if (bind)
+ atomic_inc(&p->tcfa_bindcnt);
+ }
spin_unlock(&idrinfo->lock);
- return p;
+ if (p) {
+ *a = p;
+ return true;
+ }
+ return false;
}
int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
{
- struct tcf_idrinfo *idrinfo = tn->idrinfo;
- struct tc_action *p = tcf_idr_lookup(index, idrinfo);
-
- if (p) {
- *a = p;
- return 1;
- }
- return 0;
+ return __tcf_idr_check(tn, index, a, 0);
}
EXPORT_SYMBOL(tcf_idr_search);
bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
int bind)
{
+ return __tcf_idr_check(tn, index, a, bind);
+}
+EXPORT_SYMBOL(tcf_idr_check);
+
+int tcf_idr_delete_index(struct tc_action_net *tn, u32 index)
+{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- struct tc_action *p = tcf_idr_lookup(index, idrinfo);
+ struct tc_action *p;
+ int ret = 0;
- if (index && p) {
- if (bind)
- p->tcfa_bindcnt++;
- p->tcfa_refcnt++;
- *a = p;
- return true;
+ spin_lock(&idrinfo->lock);
+ p = idr_find(&idrinfo->action_idr, index);
+ if (!p) {
+ spin_unlock(&idrinfo->lock);
+ return -ENOENT;
}
- return false;
+
+ if (!atomic_read(&p->tcfa_bindcnt)) {
+ if (refcount_dec_and_test(&p->tcfa_refcnt)) {
+ struct module *owner = p->ops->owner;
+
+ WARN_ON(p != idr_remove(&idrinfo->action_idr,
+ p->tcfa_index));
+ spin_unlock(&idrinfo->lock);
+
+ tcf_action_cleanup(p);
+ module_put(owner);
+ return 0;
+ }
+ ret = 0;
+ } else {
+ ret = -EPERM;
+ }
+
+ spin_unlock(&idrinfo->lock);
+ return ret;
}
-EXPORT_SYMBOL(tcf_idr_check);
+EXPORT_SYMBOL(tcf_idr_delete_index);
int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
@@ -304,14 +378,13 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
{
struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- struct idr *idr = &idrinfo->action_idr;
int err = -ENOMEM;
if (unlikely(!p))
return -ENOMEM;
- p->tcfa_refcnt = 1;
+ refcount_set(&p->tcfa_refcnt, 1);
if (bind)
- p->tcfa_bindcnt = 1;
+ atomic_set(&p->tcfa_bindcnt, 1);
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
@@ -322,20 +395,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
goto err2;
}
spin_lock_init(&p->tcfa_lock);
- idr_preload(GFP_KERNEL);
- spin_lock(&idrinfo->lock);
- /* user doesn't specify an index */
- if (!index) {
- index = 1;
- err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC);
- } else {
- err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
- }
- spin_unlock(&idrinfo->lock);
- idr_preload_end();
- if (err)
- goto err3;
-
p->tcfa_index = index;
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
@@ -345,7 +404,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
&p->tcfa_rate_est,
&p->tcfa_lock, NULL, est);
if (err)
- goto err4;
+ goto err3;
}
p->idrinfo = idrinfo;
@@ -353,8 +412,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
INIT_LIST_HEAD(&p->list);
*a = p;
return 0;
-err4:
- idr_remove(idr, index);
err3:
free_percpu(p->cpu_qstats);
err2:
@@ -370,11 +427,78 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
struct tcf_idrinfo *idrinfo = tn->idrinfo;
spin_lock(&idrinfo->lock);
- idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
+ /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+ WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
spin_unlock(&idrinfo->lock);
}
EXPORT_SYMBOL(tcf_idr_insert);
+/* Clean up an idr index that was allocated but not initialized. */
+
+void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+
+ spin_lock(&idrinfo->lock);
+ /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); /* warns if the removed entry was not an error pointer */
+ spin_unlock(&idrinfo->lock);
+}
+EXPORT_SYMBOL(tcf_idr_cleanup);
+
+/* Check if action with specified index exists. If the action is found, increment
+ * its reference counter (and its bind counter when 'bind' is set) and return 1.
+ * Otherwise insert a temporary error pointer (to prevent concurrent users from
+ * inserting actions with the same index) and return 0, or the idr_alloc_u32() error.
+ */
+
+int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
+ struct tc_action **a, int bind)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+ struct tc_action *p;
+ int ret;
+
+again:
+ spin_lock(&idrinfo->lock);
+ if (*index) {
+ p = idr_find(&idrinfo->action_idr, *index);
+ if (IS_ERR(p)) {
+ /* This means that another process allocated
+ * index but did not assign the pointer yet.
+ */
+ spin_unlock(&idrinfo->lock);
+ goto again; /* retry the lookup from the top */
+ }
+
+ if (p) {
+ refcount_inc(&p->tcfa_refcnt);
+ if (bind)
+ atomic_inc(&p->tcfa_bindcnt);
+ *a = p;
+ ret = 1;
+ } else {
+ *a = NULL;
+ ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
+ *index, GFP_ATOMIC); /* request exactly the caller's index */
+ if (!ret)
+ idr_replace(&idrinfo->action_idr,
+ ERR_PTR(-EBUSY), *index); /* placeholder until tcf_idr_insert() or tcf_idr_cleanup() */
+ }
+ } else {
+ *index = 1; /* caller passed index 0: start the allocation from 1 */
+ *a = NULL;
+ ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
+ UINT_MAX, GFP_ATOMIC); /* allocated id is written back through 'index' */
+ if (!ret)
+ idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
+ *index);
+ }
+ spin_unlock(&idrinfo->lock);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_check_alloc);
+
void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
struct tcf_idrinfo *idrinfo)
{
@@ -538,13 +662,15 @@ repeat:
}
EXPORT_SYMBOL(tcf_action_exec);
-int tcf_action_destroy(struct list_head *actions, int bind)
+int tcf_action_destroy(struct tc_action *actions[], int bind)
{
const struct tc_action_ops *ops;
- struct tc_action *a, *tmp;
- int ret = 0;
+ struct tc_action *a;
+ int ret = 0, i;
- list_for_each_entry_safe(a, tmp, actions, list) {
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ a = actions[i];
+ actions[i] = NULL;
ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
if (ret == ACT_P_DELETED)
@@ -555,6 +681,24 @@ int tcf_action_destroy(struct list_head *actions, int bind)
return ret;
}
+static int tcf_action_put(struct tc_action *p)
+{ /* Reference-only put: never touches the bind count. */
+ return __tcf_action_put(p, false);
+}
+
+static void tcf_action_put_many(struct tc_action *actions[])
+{ /* Put every action in 'actions', stopping at the first NULL or after TCA_ACT_MAX_PRIO entries. */
+ int i;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ struct tc_action *a = actions[i];
+ const struct tc_action_ops *ops = a->ops; /* read before the put, which may free 'a' */
+
+ if (tcf_action_put(a))
+ module_put(ops->owner); /* action was released: drop the module reference too */
+ }
+}
+
int
tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
@@ -567,16 +711,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
int err = -EINVAL;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
+ struct tc_cookie *cookie;
if (nla_put_string(skb, TCA_KIND, a->ops->kind))
goto nla_put_failure;
if (tcf_action_copy_stats(skb, a, 0))
goto nla_put_failure;
- if (a->act_cookie) {
- if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
- a->act_cookie->data))
+
+ rcu_read_lock();
+ cookie = rcu_dereference(a->act_cookie);
+ if (cookie) {
+ if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
+ rcu_read_unlock();
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
@@ -593,14 +743,15 @@ nla_put_failure:
}
EXPORT_SYMBOL(tcf_action_dump_1);
-int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
int bind, int ref)
{
struct tc_action *a;
- int err = -EINVAL;
+ int err = -EINVAL, i;
struct nlattr *nest;
- list_for_each_entry(a, actions, list) {
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ a = actions[i];
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
@@ -638,6 +789,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
+ bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tc_action *a;
@@ -688,9 +840,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
a_o = tc_lookup_action_n(act_name);
if (a_o == NULL) {
#ifdef CONFIG_MODULES
- rtnl_unlock();
+ if (rtnl_held)
+ rtnl_unlock();
request_module("act_%s", act_name);
- rtnl_lock();
+ if (rtnl_held)
+ rtnl_lock();
a_o = tc_lookup_action_n(act_name);
@@ -713,19 +867,15 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
/* backward compatibility for policer */
if (name == NULL)
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- extack);
+ rtnl_held, extack);
else
- err = a_o->init(net, nla, est, &a, ovr, bind, extack);
+ err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
+ extack);
if (err < 0)
goto err_mod;
- if (name == NULL && tb[TCA_ACT_COOKIE]) {
- if (a->act_cookie) {
- kfree(a->act_cookie->data);
- kfree(a->act_cookie);
- }
- a->act_cookie = cookie;
- }
+ if (!name && tb[TCA_ACT_COOKIE])
+ tcf_set_action_cookie(&a->act_cookie, cookie);
/* module count goes up only when brand new policy is created
* if it exists and is only bound to in a_o->init() then
@@ -737,10 +887,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
err = tcf_action_goto_chain_init(a, tp);
if (err) {
- LIST_HEAD(actions);
+ struct tc_action *actions[] = { a, NULL };
- list_add_tail(&a->list, &actions);
- tcf_action_destroy(&actions, bind);
+ tcf_action_destroy(actions, bind);
NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
return ERR_PTR(err);
}
@@ -758,21 +907,12 @@ err_out:
return ERR_PTR(err);
}
-static void cleanup_a(struct list_head *actions, int ovr)
-{
- struct tc_action *a;
-
- if (!ovr)
- return;
-
- list_for_each_entry(a, actions, list)
- a->tcfa_refcnt--;
-}
+/* Returns the number of initialized actions or a negative error. */
int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
- struct list_head *actions, size_t *attr_size,
- struct netlink_ext_ack *extack)
+ struct tc_action *actions[], size_t *attr_size,
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
@@ -786,25 +926,19 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
- extack);
+ rtnl_held, extack);
if (IS_ERR(act)) {
err = PTR_ERR(act);
goto err;
}
act->order = i;
sz += tcf_action_fill_size(act);
- if (ovr)
- act->tcfa_refcnt++;
- list_add_tail(&act->list, actions);
+ /* Start from index 0 */
+ actions[i - 1] = act;
}
*attr_size = tcf_action_full_attrs_size(sz);
-
- /* Remove the temp refcnt which was necessary to protect against
- * destroying an existing action which was being replaced
- */
- cleanup_a(actions, ovr);
- return 0;
+ return i - 1;
err:
tcf_action_destroy(actions, bind);
@@ -855,7 +989,7 @@ errout:
return -1;
}
-static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
+static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
u32 portid, u32 seq, u16 flags, int event, int bind,
int ref)
{
@@ -891,7 +1025,7 @@ out_nlmsg_trim:
static int
tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
- struct list_head *actions, int event,
+ struct tc_action *actions[], int event,
struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
@@ -900,7 +1034,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
if (!skb)
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
- 0, 0) <= 0) {
+ 0, 1) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
kfree_skb(skb);
return -EINVAL;
@@ -1027,9 +1161,41 @@ err_out:
return err;
}
+static int tcf_action_delete(struct net *net, struct tc_action *actions[],
+ int *acts_deleted, struct netlink_ext_ack *extack)
+{ /* Put each action, then delete it by index; returns 0 or the first negative ops->delete() result. */
+ u32 act_index;
+ int ret, i;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ struct tc_action *a = actions[i];
+ const struct tc_action_ops *ops = a->ops;
+
+ /* Actions can be deleted concurrently so we must save their
+ * ops and index to delete by index after reference is released.
+ */
+ act_index = a->tcfa_index;
+
+ if (tcf_action_put(a)) {
+ /* last reference, action was deleted concurrently */
+ module_put(ops->owner);
+ } else {
+ /* now do the delete */
+ ret = ops->delete(net, act_index);
+ if (ret < 0) {
+ *acts_deleted = i + 1; /* entry i was already put above, so count it */
+ return ret;
+ }
+ }
+ }
+ *acts_deleted = i; /* number of entries processed */
+ return 0;
+}
+
static int
-tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
- u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
+tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
+ int *acts_deleted, u32 portid, size_t attr_size,
+ struct netlink_ext_ack *extack)
{
int ret;
struct sk_buff *skb;
@@ -1040,14 +1206,14 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
- 0, 1) <= 0) {
+ 0, 2) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
kfree_skb(skb);
return -EINVAL;
}
/* now do the delete */
- ret = tcf_action_destroy(actions, 0);
+ ret = tcf_action_delete(net, actions, acts_deleted, extack);
if (ret < 0) {
NL_SET_ERR_MSG(extack, "Failed to delete TC action");
kfree_skb(skb);
@@ -1069,7 +1235,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
size_t attr_size = 0;
- LIST_HEAD(actions);
+ struct tc_action *actions[TCA_ACT_MAX_PRIO + 1] = {};
+ int acts_deleted = 0;
ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
if (ret < 0)
@@ -1091,27 +1258,27 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
}
act->order = i;
attr_size += tcf_action_fill_size(act);
- list_add_tail(&act->list, &actions);
+ actions[i - 1] = act;
}
attr_size = tcf_action_full_attrs_size(attr_size);
if (event == RTM_GETACTION)
- ret = tcf_get_notify(net, portid, n, &actions, event, extack);
+ ret = tcf_get_notify(net, portid, n, actions, event, extack);
else { /* delete */
- ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack);
+ ret = tcf_del_notify(net, n, actions, &acts_deleted, portid,
+ attr_size, extack);
if (ret)
goto err;
return ret;
}
err:
- if (event != RTM_GETACTION)
- tcf_action_destroy(&actions, 0);
+ tcf_action_put_many(&actions[acts_deleted]);
return ret;
}
static int
-tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
@@ -1142,14 +1309,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
{
size_t attr_size = 0;
int ret = 0;
- LIST_HEAD(actions);
+ struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
- ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions,
- &attr_size, extack);
- if (ret)
+ ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions,
+ &attr_size, true, extack);
+ if (ret < 0)
return ret;
+ ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
+ if (ovr)
+ tcf_action_put_many(actions);
- return tcf_add_notify(net, n, &actions, portid, attr_size, extack);
+ return ret;
}
static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 18089c02e557..06f743d8ed41 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
struct tcf_bpf *prog = to_bpf(act);
struct tc_act_bpf opt = {
.index = prog->tcf_index,
- .refcnt = prog->tcf_refcnt - ref,
- .bindcnt = prog->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&prog->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind,
.action = prog->tcf_action,
};
struct tcf_t tm;
@@ -276,7 +276,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
- int replace, int bind, struct netlink_ext_ack *extack)
+ int replace, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
@@ -298,21 +299,27 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
- if (!tcf_idr_check(tn, parm->index, act, bind)) {
+ ret = tcf_idr_check_alloc(tn, &parm->index, act, bind);
+ if (!ret) {
ret = tcf_idr_create(tn, parm->index, est, act,
&act_bpf_ops, bind, true);
- if (ret < 0)
+ if (ret < 0) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
res = ACT_P_CREATED;
- } else {
+ } else if (ret > 0) {
/* Don't override defaults. */
if (bind)
return 0;
- tcf_idr_release(*act, bind);
- if (!replace)
+ if (!replace) {
+ tcf_idr_release(*act, bind);
return -EEXIST;
+ }
+ } else {
+ return ret;
}
is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
@@ -355,8 +362,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
return res;
out:
- if (res == ACT_P_CREATED)
- tcf_idr_release(*act, bind);
+ tcf_idr_release(*act, bind);
return ret;
}
@@ -387,6 +393,13 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_bpf_delete(struct net *net, u32 index)
+{ /* Delete bpf action 'index' from the tc_action_net looked up for 'net'. */
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ return tcf_idr_delete_index(tn, index); /* result is passed through unchanged */
+}
+
static struct tc_action_ops act_bpf_ops __read_mostly = {
.kind = "bpf",
.type = TCA_ACT_BPF,
@@ -397,6 +410,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
.init = tcf_bpf_init,
.walk = tcf_bpf_walker,
.lookup = tcf_bpf_search,
+ .delete = tcf_bpf_delete,
.size = sizeof(struct tcf_bpf),
};
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e4b880fa51fe..1e31f0e448e2 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind,
+ int ovr, int bind, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!ret) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_connmark_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ci = to_connmark(*a);
ci->tcf_action = parm->action;
@@ -131,16 +134,18 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
tcf_idr_insert(tn, *a);
ret = ACT_P_CREATED;
- } else {
+ } else if (ret > 0) {
ci = to_connmark(*a);
if (bind)
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
/* replacing action and zone */
ci->tcf_action = parm->action;
ci->zone = parm->zone;
+ ret = 0;
}
return ret;
@@ -154,8 +159,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
struct tc_connmark opt = {
.index = ci->tcf_index,
- .refcnt = ci->tcf_refcnt - ref,
- .bindcnt = ci->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
.action = ci->tcf_action,
.zone = ci->zone,
};
@@ -193,6 +198,13 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_connmark_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_connmark_ops = {
.kind = "connmark",
.type = TCA_ACT_CONNMARK,
@@ -202,6 +214,7 @@ static struct tc_action_ops act_connmark_ops = {
.init = tcf_connmark_init,
.walk = tcf_connmark_walker,
.lookup = tcf_connmark_search,
+ .delete = tcf_connmark_delete,
.size = sizeof(struct tcf_connmark_info),
};
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 6e7124e57918..4e8c383f379e 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -46,7 +46,8 @@ static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
struct tcf_csum_params *params_old, *params_new;
@@ -66,18 +67,24 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_csum_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)/* dont override defaults */
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
+ } else {
+ return err;
}
p = to_tcf_csum(*a);
@@ -85,8 +92,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
params_old = rtnl_dereference(p->params);
@@ -597,8 +603,8 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
struct tcf_csum_params *params;
struct tc_csum opt = {
.index = p->tcf_index,
- .refcnt = p->tcf_refcnt - ref,
- .bindcnt = p->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
.action = p->tcf_action,
};
struct tcf_t t;
@@ -653,6 +659,13 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act)
return nla_total_size(sizeof(struct tc_csum));
}
+static int tcf_csum_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
.type = TCA_ACT_CSUM,
@@ -664,6 +677,7 @@ static struct tc_action_ops act_csum_ops = {
.walk = tcf_csum_walker,
.lookup = tcf_csum_search,
.get_fill_size = tcf_csum_get_fill_size,
+ .delete = tcf_csum_delete,
.size = sizeof(struct tcf_csum),
};
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 4dc4f153cad8..661b72b9147d 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
@@ -90,18 +91,24 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
}
#endif
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_gact_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)/* dont override defaults */
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
+ } else {
+ return err;
}
gact = to_gact(*a);
@@ -169,8 +176,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_gact *gact = to_gact(a);
struct tc_gact opt = {
.index = gact->tcf_index,
- .refcnt = gact->tcf_refcnt - ref,
- .bindcnt = gact->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&gact->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
.action = gact->tcf_action,
};
struct tcf_t t;
@@ -230,6 +237,13 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act)
return sz;
}
+static int tcf_gact_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_gact_ops = {
.kind = "gact",
.type = TCA_ACT_GACT,
@@ -241,6 +255,7 @@ static struct tc_action_ops act_gact_ops = {
.walk = tcf_gact_walker,
.lookup = tcf_gact_search,
.get_fill_size = tcf_gact_get_fill_size,
+ .delete = tcf_gact_delete,
.size = sizeof(struct tcf_gact),
};
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 20d7d36b2fc9..3d6e265758c0 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -448,7 +448,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
@@ -483,7 +484,12 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (!p)
return -ENOMEM;
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0) {
+ kfree(p);
+ return err;
+ }
+ exists = err;
if (exists && bind) {
kfree(p);
return 0;
@@ -493,16 +499,15 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops,
bind, true);
if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
kfree(p);
return ret;
}
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr) {
- kfree(p);
- return -EEXIST;
- }
+ kfree(p);
+ return -EEXIST;
}
ife = to_ife(*a);
@@ -547,6 +552,8 @@ metadata_parse_err:
if (exists)
spin_unlock_bh(&ife->tcf_lock);
+ tcf_idr_release(*a, bind);
+
kfree(p);
return err;
}
@@ -596,8 +603,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
struct tcf_ife_params *p = rtnl_dereference(ife->params);
struct tc_ife opt = {
.index = ife->tcf_index,
- .refcnt = ife->tcf_refcnt - ref,
- .bindcnt = ife->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&ife->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
.action = ife->tcf_action,
.flags = p->flags,
};
@@ -843,6 +850,13 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_ife_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_ife_ops = {
.kind = "ife",
.type = TCA_ACT_IFE,
@@ -853,6 +867,7 @@ static struct tc_action_ops act_ife_ops = {
.init = tcf_ife_init,
.walk = tcf_ife_walker,
.lookup = tcf_ife_search,
+ .delete = tcf_ife_delete,
.size = sizeof(struct tcf_ife_info),
};
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 14c312d7908f..0dc787a57798 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -119,13 +119,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
if (tb[TCA_IPT_INDEX] != NULL)
index = nla_get_u32(tb[TCA_IPT_INDEX]);
- exists = tcf_idr_check(tn, index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
@@ -133,22 +138,27 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, ops, bind,
false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
return ret;
+ }
ret = ACT_P_CREATED;
} else {
if (bind)/* dont override defaults */
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
}
hook = nla_get_u32(tb[TCA_IPT_HOOK]);
@@ -196,7 +206,8 @@ err1:
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
bind);
@@ -204,7 +215,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla,
static int tcf_xt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
bind);
@@ -280,8 +292,8 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
if (unlikely(!t))
goto nla_put_failure;
- c.bindcnt = ipt->tcf_bindcnt - bind;
- c.refcnt = ipt->tcf_refcnt - ref;
+ c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind;
+ c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref;
strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
@@ -322,6 +334,13 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_ipt_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_ipt_ops = {
.kind = "ipt",
.type = TCA_ACT_IPT,
@@ -332,6 +351,7 @@ static struct tc_action_ops act_ipt_ops = {
.init = tcf_ipt_init,
.walk = tcf_ipt_walker,
.lookup = tcf_ipt_search,
+ .delete = tcf_ipt_delete,
.size = sizeof(struct tcf_ipt),
};
@@ -372,6 +392,13 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_xt_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_xt_ops = {
.kind = "xt",
.type = TCA_ACT_XT,
@@ -382,6 +409,7 @@ static struct tc_action_ops act_xt_ops = {
.init = tcf_xt_init,
.walk = tcf_xt_walker,
.lookup = tcf_xt_search,
+ .delete = tcf_xt_delete,
.size = sizeof(struct tcf_ipt),
};
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index fd34015331ab..6afd89a36c69 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -68,8 +68,9 @@ static unsigned int mirred_net_id;
static struct tc_action_ops act_mirred_ops;
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
@@ -78,7 +79,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct tcf_mirred *m;
struct net_device *dev;
bool exists = false;
- int ret;
+ int ret, err;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
@@ -93,7 +94,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(tb[TCA_MIRRED_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
@@ -106,6 +110,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
default:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
return -EINVAL;
}
@@ -114,6 +120,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (dev == NULL) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -ENODEV;
}
mac_header_xmit = dev_is_mac_header_xmit(dev);
@@ -123,18 +131,20 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (!exists) {
if (!dev) {
+ tcf_idr_cleanup(tn, parm->index);
NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
return -EINVAL;
}
ret = tcf_idr_create(tn, parm->index, est, a,
&act_mirred_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
m = to_mirred(*a);
@@ -250,8 +260,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
struct tc_mirred opt = {
.index = m->tcf_index,
.action = m->tcf_action,
- .refcnt = m->tcf_refcnt - ref,
- .bindcnt = m->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
.eaction = m->tcfm_eaction,
.ifindex = dev ? dev->ifindex : 0,
};
@@ -321,6 +331,13 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
return rtnl_dereference(m->tcfm_dev);
}
+static int tcf_mirred_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.type = TCA_ACT_MIRRED,
@@ -334,6 +351,7 @@ static struct tc_action_ops act_mirred_ops = {
.lookup = tcf_mirred_search,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
+ .delete = tcf_mirred_delete,
};
static __net_init int mirred_init_net(struct net *net)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4b5848b6c252..4dd9188a72fd 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action **a, int ovr, int bind,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -57,18 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_nat_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
+ } else {
+ return err;
}
p = to_tcf_nat(*a);
@@ -257,8 +263,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
.index = p->tcf_index,
.action = p->tcf_action,
- .refcnt = p->tcf_refcnt - ref,
- .bindcnt = p->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
struct tcf_t t;
@@ -294,6 +300,13 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_nat_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_nat_ops = {
.kind = "nat",
.type = TCA_ACT_NAT,
@@ -303,6 +316,7 @@ static struct tc_action_ops act_nat_ops = {
.init = tcf_nat_init,
.walk = tcf_nat_walker,
.lookup = tcf_nat_search,
+ .delete = tcf_nat_delete,
.size = sizeof(struct tcf_nat),
};
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 8a925c72db5f..cc8ffcd1ddb5 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -132,20 +132,23 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
- struct nlattr *pattr;
- struct tc_pedit *parm;
- int ret = 0, err;
- struct tcf_pedit *p;
struct tc_pedit_key *keys = NULL;
struct tcf_pedit_key_ex *keys_ex;
+ struct tc_pedit *parm;
+ struct nlattr *pattr;
+ struct tcf_pedit *p;
+ int ret = 0, err;
int ksize;
- if (nla == NULL)
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
return -EINVAL;
+ }
err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
if (err < 0)
@@ -154,47 +157,62 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
pattr = tb[TCA_PEDIT_PARMS];
if (!pattr)
pattr = tb[TCA_PEDIT_PARMS_EX];
- if (!pattr)
+ if (!pattr) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute");
return -EINVAL;
+ }
parm = nla_data(pattr);
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
- if (nla_len(pattr) < sizeof(*parm) + ksize)
+ if (nla_len(pattr) < sizeof(*parm) + ksize) {
+ NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
return -EINVAL;
+ }
keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
if (IS_ERR(keys_ex))
return PTR_ERR(keys_ex);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
- if (!parm->nkeys)
- return -EINVAL;
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
+ if (!parm->nkeys) {
+ tcf_idr_cleanup(tn, parm->index);
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ ret = -EINVAL;
+ goto out_free;
+ }
ret = tcf_idr_create(tn, parm->index, est, a,
&act_pedit_ops, bind, false);
- if (ret)
- return ret;
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
+ goto out_free;
+ }
p = to_pedit(*a);
keys = kmalloc(ksize, GFP_KERNEL);
- if (keys == NULL) {
+ if (!keys) {
tcf_idr_release(*a, bind);
- kfree(keys_ex);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_free;
}
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)
- return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ goto out_free;
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
+ ret = -EEXIST;
+ goto out_free;
+ }
p = to_pedit(*a);
if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
keys = kmalloc(ksize, GFP_KERNEL);
if (!keys) {
- kfree(keys_ex);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_free;
}
}
+ } else {
+ return err;
}
spin_lock_bh(&p->tcf_lock);
@@ -214,12 +232,17 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+out_free:
+ kfree(keys_ex);
+ return ret;
+
}
static void tcf_pedit_cleanup(struct tc_action *a)
{
struct tcf_pedit *p = to_pedit(a);
struct tc_pedit_key *keys = p->tcfp_keys;
+
kfree(keys);
kfree(p->tcfp_keys_ex);
}
@@ -284,11 +307,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
if (p->tcfp_nkeys > 0) {
struct tc_pedit_key *tkey = p->tcfp_keys;
struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
- enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+ enum pedit_header_type htype =
+ TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
- u32 *ptr, _data;
+ u32 *ptr, hdata;
int offset = tkey->off;
int hoffset;
u32 val;
@@ -303,39 +327,39 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
if (rc) {
- pr_info("tc filter pedit bad header type specified (0x%x)\n",
+ pr_info("tc action pedit bad header type specified (0x%x)\n",
htype);
goto bad;
}
if (tkey->offmask) {
- char *d, _d;
+ u8 *d, _d;
if (!offset_valid(skb, hoffset + tkey->at)) {
- pr_info("tc filter pedit 'at' offset %d out of bounds\n",
+ pr_info("tc action pedit 'at' offset %d out of bounds\n",
hoffset + tkey->at);
goto bad;
}
- d = skb_header_pointer(skb, hoffset + tkey->at, 1,
- &_d);
+ d = skb_header_pointer(skb, hoffset + tkey->at,
+ sizeof(_d), &_d);
if (!d)
goto bad;
offset += (*d & tkey->offmask) >> tkey->shift;
}
if (offset % 4) {
- pr_info("tc filter pedit"
- " offset must be on 32 bit boundaries\n");
+ pr_info("tc action pedit offset must be on 32 bit boundaries\n");
goto bad;
}
if (!offset_valid(skb, hoffset + offset)) {
- pr_info("tc filter pedit offset %d out of bounds\n",
+ pr_info("tc action pedit offset %d out of bounds\n",
hoffset + offset);
goto bad;
}
- ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
+ ptr = skb_header_pointer(skb, hoffset + offset,
+ sizeof(hdata), &hdata);
if (!ptr)
goto bad;
/* just do it, baby */
@@ -347,19 +371,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
val = (*ptr + tkey->val) & ~tkey->mask;
break;
default:
- pr_info("tc filter pedit bad command (%d)\n",
+ pr_info("tc action pedit bad command (%d)\n",
cmd);
goto bad;
}
*ptr = ((*ptr & tkey->mask) ^ val);
- if (ptr == &_data)
+ if (ptr == &hdata)
skb_store_bits(skb, hoffset + offset, ptr, 4);
}
goto done;
- } else
+ } else {
WARN(1, "pedit BUG: index %d\n", p->tcf_index);
+ }
bad:
p->tcf_qstats.overlimits++;
@@ -391,8 +416,8 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
opt->nkeys = p->tcfp_nkeys;
opt->flags = p->tcfp_flags;
opt->action = p->tcf_action;
- opt->refcnt = p->tcf_refcnt - ref;
- opt->bindcnt = p->tcf_bindcnt - bind;
+ opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
+ opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
if (p->tcfp_keys_ex) {
tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
@@ -435,6 +460,13 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_pedit_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_pedit_ops = {
.kind = "pedit",
.type = TCA_ACT_PEDIT,
@@ -445,6 +477,7 @@ static struct tc_action_ops act_pedit_ops = {
.init = tcf_pedit_init,
.walk = tcf_pedit_walker,
.lookup = tcf_pedit_search,
+ .delete = tcf_pedit_delete,
.size = sizeof(struct tcf_pedit),
};
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 4e72bc2a0dfb..1f3192ea8df7 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -75,7 +75,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
static int tcf_act_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind,
+ int ovr, int bind, bool rtnl_held,
struct netlink_ext_ack *extack)
{
int ret = 0, err;
@@ -101,20 +101,24 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_POLICE_TBF]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!exists) {
ret = tcf_idr_create(tn, parm->index, NULL, a,
&act_police_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
police = to_police(*a);
@@ -195,8 +199,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return err;
}
@@ -274,8 +277,8 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
.action = police->tcf_action,
.mtu = police->tcfp_mtu,
.burst = PSCHED_NS2TICKS(police->tcfp_burst),
- .refcnt = police->tcf_refcnt - ref,
- .bindcnt = police->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&police->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&police->tcf_bindcnt) - bind,
};
struct tcf_t t;
@@ -314,6 +317,13 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_police_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
MODULE_AUTHOR("Alexey Kuznetsov");
MODULE_DESCRIPTION("Policing actions");
MODULE_LICENSE("GPL");
@@ -327,6 +337,7 @@ static struct tc_action_ops act_police_ops = {
.init = tcf_act_police_init,
.walk = tcf_act_police_walker,
.lookup = tcf_police_search,
+ .delete = tcf_police_delete,
.size = sizeof(struct tcf_police),
};
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 5db358497c9e..3079e7be5bde 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -37,7 +37,8 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
@@ -45,7 +46,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct tc_sample *parm;
struct tcf_sample *s;
bool exists = false;
- int ret;
+ int ret, err;
if (!nla)
return -EINVAL;
@@ -58,20 +59,24 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_SAMPLE_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_sample_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
s = to_sample(*a);
@@ -80,8 +85,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
psample_group = psample_group_get(net, s->psample_group_num);
if (!psample_group) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
RCU_INIT_POINTER(s->psample_group, psample_group);
@@ -173,8 +177,8 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
struct tc_sample opt = {
.index = s->tcf_index,
.action = s->tcf_action,
- .refcnt = s->tcf_refcnt - ref,
- .bindcnt = s->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&s->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&s->tcf_bindcnt) - bind,
};
struct tcf_t t;
@@ -219,6 +223,13 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_sample_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, sample_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_sample_ops = {
.kind = "sample",
.type = TCA_ACT_SAMPLE,
@@ -229,6 +240,7 @@ static struct tc_action_ops act_sample_ops = {
.cleanup = tcf_sample_cleanup,
.walk = tcf_sample_walker,
.lookup = tcf_sample_search,
+ .delete = tcf_sample_delete,
.size = sizeof(struct tcf_sample),
};
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 98c4afe7c15b..aa51152e0066 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
@@ -99,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_DEF_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (tb[TCA_DEF_DATA] == NULL) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_simp_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
d = to_defact(*a);
ret = alloc_defdata(d, tb[TCA_DEF_DATA]);
@@ -126,9 +134,10 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
} else {
d = to_defact(*a);
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
reset_policy(d, tb[TCA_DEF_DATA], parm);
}
@@ -145,8 +154,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_defact *d = to_defact(a);
struct tc_defact opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
.action = d->tcf_action,
};
struct tcf_t t;
@@ -183,6 +192,13 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_simp_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_simp_ops = {
.kind = "simple",
.type = TCA_ACT_SIMP,
@@ -193,6 +209,7 @@ static struct tc_action_ops act_simp_ops = {
.init = tcf_simp_init,
.walk = tcf_simp_walker,
.lookup = tcf_simp_search,
+ .delete = tcf_simp_delete,
.size = sizeof(struct tcf_defact),
};
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6138d1d71900..da56e6938c9e 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -23,6 +23,9 @@
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
#include <linux/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_skbedit.h>
@@ -34,25 +37,54 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
+ int action;
- spin_lock(&d->tcf_lock);
tcf_lastuse_update(&d->tcf_tm);
- bstats_update(&d->tcf_bstats, skb);
-
- if (d->flags & SKBEDIT_F_PRIORITY)
- skb->priority = d->priority;
- if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
- skb->dev->real_num_tx_queues > d->queue_mapping)
- skb_set_queue_mapping(skb, d->queue_mapping);
- if (d->flags & SKBEDIT_F_MARK) {
- skb->mark &= ~d->mask;
- skb->mark |= d->mark & d->mask;
+ bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+
+ rcu_read_lock();
+ params = rcu_dereference(d->params);
+ action = READ_ONCE(d->tcf_action);
+
+ if (params->flags & SKBEDIT_F_PRIORITY)
+ skb->priority = params->priority;
+ if (params->flags & SKBEDIT_F_INHERITDSFIELD) {
+ int wlen = skb_network_offset(skb);
+
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto err;
+ skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+ break;
+
+ case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto err;
+ skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+ break;
+ }
}
- if (d->flags & SKBEDIT_F_PTYPE)
- skb->pkt_type = d->ptype;
-
- spin_unlock(&d->tcf_lock);
- return d->tcf_action;
+ if (params->flags & SKBEDIT_F_QUEUE_MAPPING &&
+ skb->dev->real_num_tx_queues > params->queue_mapping)
+ skb_set_queue_mapping(skb, params->queue_mapping);
+ if (params->flags & SKBEDIT_F_MARK) {
+ skb->mark &= ~params->mask;
+ skb->mark |= params->mark & params->mask;
+ }
+ if (params->flags & SKBEDIT_F_PTYPE)
+ skb->pkt_type = params->ptype;
+
+unlock:
+ rcu_read_unlock();
+ return action;
+err:
+ qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
+ action = TC_ACT_SHOT;
+ goto unlock;
}
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
@@ -62,13 +94,16 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
[TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+ struct tcf_skbedit_params *params_old, *params_new;
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
struct tc_skbedit *parm;
struct tcf_skbedit *d;
@@ -114,52 +149,76 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
mask = nla_data(tb[TCA_SKBEDIT_MASK]);
}
+ if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
+ u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+
+ if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
+ flags |= SKBEDIT_F_INHERITDSFIELD;
+ }
+
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!flags) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
- &act_skbedit_ops, bind, false);
- if (ret)
+ &act_skbedit_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
d = to_skbedit(*a);
ret = ACT_P_CREATED;
} else {
d = to_skbedit(*a);
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
}
- spin_lock_bh(&d->tcf_lock);
+ ASSERT_RTNL();
+
+ params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+ if (unlikely(!params_new)) {
+ if (ret == ACT_P_CREATED)
+ tcf_idr_release(*a, bind);
+ return -ENOMEM;
+ }
- d->flags = flags;
+ params_new->flags = flags;
if (flags & SKBEDIT_F_PRIORITY)
- d->priority = *priority;
+ params_new->priority = *priority;
if (flags & SKBEDIT_F_QUEUE_MAPPING)
- d->queue_mapping = *queue_mapping;
+ params_new->queue_mapping = *queue_mapping;
if (flags & SKBEDIT_F_MARK)
- d->mark = *mark;
+ params_new->mark = *mark;
if (flags & SKBEDIT_F_PTYPE)
- d->ptype = *ptype;
+ params_new->ptype = *ptype;
/* default behaviour is to use all the bits */
- d->mask = 0xffffffff;
+ params_new->mask = 0xffffffff;
if (flags & SKBEDIT_F_MASK)
- d->mask = *mask;
+ params_new->mask = *mask;
d->tcf_action = parm->action;
-
- spin_unlock_bh(&d->tcf_lock);
+ params_old = rtnl_dereference(d->params);
+ rcu_assign_pointer(d->params, params_new);
+ if (params_old)
+ kfree_rcu(params_old, rcu);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -171,30 +230,39 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
struct tc_skbedit opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
.action = d->tcf_action,
};
+ u64 pure_flags = 0;
struct tcf_t t;
+ params = rtnl_dereference(d->params);
+
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_PRIORITY) &&
- nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority))
+ if ((params->flags & SKBEDIT_F_PRIORITY) &&
+ nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority))
+ goto nla_put_failure;
+ if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) &&
+ nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
- nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping))
+ if ((params->flags & SKBEDIT_F_MARK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_MARK) &&
- nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark))
+ if ((params->flags & SKBEDIT_F_PTYPE) &&
+ nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_PTYPE) &&
- nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype))
+ if ((params->flags & SKBEDIT_F_MASK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_MASK) &&
- nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
+ if (params->flags & SKBEDIT_F_INHERITDSFIELD)
+ pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+ if (pure_flags != 0 &&
+ nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
goto nla_put_failure;
tcf_tm_dump(&t, &d->tcf_tm);
@@ -207,6 +275,16 @@ nla_put_failure:
return -1;
}
+static void tcf_skbedit_cleanup(struct tc_action *a)
+{
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
+
+ params = rcu_dereference_protected(d->params, 1);
+ if (params)
+ kfree_rcu(params, rcu);
+}
+
static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
const struct tc_action_ops *ops,
@@ -225,6 +303,13 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_skbedit_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_skbedit_ops = {
.kind = "skbedit",
.type = TCA_ACT_SKBEDIT,
@@ -232,8 +317,10 @@ static struct tc_action_ops act_skbedit_ops = {
.act = tcf_skbedit,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
+ .cleanup = tcf_skbedit_cleanup,
.walk = tcf_skbedit_walker,
.lookup = tcf_skbedit_search,
+ .delete = tcf_skbedit_delete,
.size = sizeof(struct tcf_skbedit),
};
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index ad050d7d4b46..cdc6bacfb190 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -84,7 +84,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
struct nlattr *tb[TCA_SKBMOD_MAX + 1];
@@ -127,27 +128,33 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
if (parm->flags & SKBMOD_F_SWAPMAC)
lflags = SKBMOD_F_SWAPMAC;
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!lflags) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_skbmod_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
d = to_skbmod(*a);
@@ -155,8 +162,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
ASSERT_RTNL();
p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
if (unlikely(!p)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
@@ -205,8 +211,8 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p);
struct tc_skbmod opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
.action = d->tcf_action,
};
struct tcf_t t;
@@ -252,6 +258,13 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_skbmod_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_skbmod_ops = {
.kind = "skbmod",
.type = TCA_ACT_SKBMOD,
@@ -262,6 +275,7 @@ static struct tc_action_ops act_skbmod_ops = {
.cleanup = tcf_skbmod_cleanup,
.walk = tcf_skbmod_walker,
.lookup = tcf_skbmod_search,
+ .delete = tcf_skbmod_delete,
.size = sizeof(struct tcf_skbmod),
};
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 9bc6c2ae98a5..f811850fd1d0 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -13,6 +13,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+#include <net/geneve.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -57,6 +58,135 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
return action;
}
+static const struct nla_policy
+enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
+ .len = 128 },
+};
+
+static int
+tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1];
+ int err, data_len, opt_len;
+ u8 *data;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
+ return -EINVAL;
+ }
+
+ data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+ data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+ if (data_len < 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
+ return -ERANGE;
+ }
+ if (data_len % 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
+ return -ERANGE;
+ }
+
+ opt_len = sizeof(struct geneve_opt) + data_len;
+ if (dst) {
+ struct geneve_opt *opt = dst;
+
+ WARN_ON(dst_len < opt_len);
+
+ opt->opt_class =
+ nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]);
+ opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]);
+ opt->length = data_len / 4; /* length is in units of 4 bytes */
+ opt->r1 = 0;
+ opt->r2 = 0;
+ opt->r3 = 0;
+
+ memcpy(opt + 1, data, data_len);
+ }
+
+ return opt_len;
+}
+
+static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
+ int dst_len, struct netlink_ext_ack *extack)
+{
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ const struct nlattr *attr, *head = nla_data(nla);
+
+ err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_for_each_attr(attr, head, len, rem) {
+ switch (nla_type(attr)) {
+ case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ opt_len = tunnel_key_copy_geneve_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ if (dst) {
+ dst_len -= opt_len;
+ dst += opt_len;
+ }
+ break;
+ }
+ }
+
+ if (!opts_len) {
+ NL_SET_ERR_MSG(extack, "Empty list of tunnel options");
+ return -EINVAL;
+ }
+
+ if (rem > 0) {
+ NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes");
+ return -EINVAL;
+ }
+
+ return opts_len;
+}
+
+static int tunnel_key_get_opts_len(struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ return tunnel_key_copy_opts(nla, NULL, 0, extack);
+}
+
+static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
+ int opts_len, struct netlink_ext_ack *extack)
+{
+ info->options_len = opts_len;
+ switch (nla_type(nla_data(nla))) {
+ case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ default:
+ NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
+ return -EINVAL;
+ }
+}
+
static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
[TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) },
[TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
@@ -66,11 +196,15 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
[TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
[TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
[TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 },
};
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
@@ -81,24 +215,35 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct tcf_tunnel_key *t;
bool exists = false;
__be16 dst_port = 0;
+ int opts_len = 0;
__be64 key_id;
__be16 flags;
+ u8 tos, ttl;
int ret = 0;
int err;
- if (!nla)
+ if (!nla) {
+ NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed");
return -EINVAL;
+ }
err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy,
- NULL);
- if (err < 0)
+ extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes");
return err;
+ }
- if (!tb[TCA_TUNNEL_KEY_PARMS])
+ if (!tb[TCA_TUNNEL_KEY_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key parameters");
return -EINVAL;
+ }
parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
@@ -107,6 +252,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
break;
case TCA_TUNNEL_KEY_ACT_SET:
if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key id");
ret = -EINVAL;
goto err_out;
}
@@ -121,6 +267,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
+ if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) {
+ opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+ extack);
+ if (opts_len < 0) {
+ ret = opts_len;
+ goto err_out;
+ }
+ }
+
+ tos = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TOS])
+ tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]);
+ ttl = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TTL])
+ ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]);
+
if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
__be32 saddr;
@@ -129,9 +291,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
- metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+ metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl,
dst_port, flags,
- key_id, 0);
+ key_id, opts_len);
} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
struct in6_addr saddr;
@@ -140,19 +302,33 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
- metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
+ metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
0, flags,
key_id, 0);
+ } else {
+ NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst");
+ ret = -EINVAL;
+ goto err_out;
}
if (!metadata) {
- ret = -EINVAL;
+ NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst");
+ ret = -ENOMEM;
goto err_out;
}
+ if (opts_len) {
+ ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+ &metadata->u.tun_info,
+ opts_len, extack);
+ if (ret < 0)
+ goto err_out;
+ }
+
metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
break;
default:
+ NL_SET_ERR_MSG(extack, "Unknown tunnel key action");
ret = -EINVAL;
goto err_out;
}
@@ -160,14 +336,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_tunnel_key_ops, bind, true);
- if (ret)
- return ret;
+ if (ret) {
+ NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
+ goto err_out;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ NL_SET_ERR_MSG(extack, "TC IDR already exists");
+ return -EEXIST;
}
t = to_tunnel_key(*a);
@@ -175,8 +353,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
ASSERT_RTNL();
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
+ NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
return -ENOMEM;
}
@@ -199,6 +377,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
err_out:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return ret;
}
@@ -216,6 +396,61 @@ static void tunnel_key_release(struct tc_action *a)
}
}
+static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ int len = info->options_len;
+ u8 *src = (u8 *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
+ if (!start)
+ return -EMSGSIZE;
+
+ while (len > 0) {
+ struct geneve_opt *opt = (struct geneve_opt *)src;
+
+ if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE,
+ opt->type) ||
+ nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA,
+ opt->length * 4, opt + 1))
+ return -EMSGSIZE;
+
+ len -= sizeof(struct geneve_opt) + opt->length * 4;
+ src += sizeof(struct geneve_opt) + opt->length * 4;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct nlattr *start;
+ int err;
+
+ if (!info->options_len)
+ return 0;
+
+ start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+ err = tunnel_key_geneve_opts_dump(skb, info);
+ if (err)
+ return err;
+ } else {
+ return -EINVAL;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
static int tunnel_key_dump_addresses(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
@@ -252,8 +487,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_tunnel_key_params *params;
struct tc_tunnel_key opt = {
.index = t->tcf_index,
- .refcnt = t->tcf_refcnt - ref,
- .bindcnt = t->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&t->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&t->tcf_bindcnt) - bind,
.action = t->tcf_action,
};
struct tcf_t tm;
@@ -266,8 +501,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
goto nla_put_failure;
if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
- struct ip_tunnel_key *key =
- &params->tcft_enc_metadata->u.tun_info.key;
+ struct ip_tunnel_info *info =
+ &params->tcft_enc_metadata->u.tun_info;
+ struct ip_tunnel_key *key = &info->key;
__be32 key_id = tunnel_id_to_key32(key->tun_id);
if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
@@ -275,7 +511,14 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
&params->tcft_enc_metadata->u.tun_info) ||
nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
- !(key->tun_flags & TUNNEL_CSUM)))
+ !(key->tun_flags & TUNNEL_CSUM)) ||
+ tunnel_key_opts_dump(skb, info))
+ goto nla_put_failure;
+
+ if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos))
+ goto nla_put_failure;
+
+ if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl))
goto nla_put_failure;
}
@@ -309,6 +552,13 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tunnel_key_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_tunnel_key_ops = {
.kind = "tunnel_key",
.type = TCA_ACT_TUNNEL_KEY,
@@ -319,6 +569,7 @@ static struct tc_action_ops act_tunnel_key_ops = {
.cleanup = tunnel_key_release,
.walk = tunnel_key_walker,
.lookup = tunnel_key_search,
+ .delete = tunnel_key_delete,
.size = sizeof(struct tcf_tunnel_key),
};
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 1fb39e1f9d07..ad37f308175a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -109,7 +109,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
@@ -133,7 +134,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!tb[TCA_VLAN_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_VLAN_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
@@ -145,12 +149,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
if (push_vid >= VLAN_VID_MASK) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -ERANGE;
}
@@ -163,6 +171,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
default:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EPROTONOSUPPORT;
}
} else {
@@ -175,6 +185,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
default:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
action = parm->v_action;
@@ -182,14 +194,15 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_vlan_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
v = to_vlan(*a);
@@ -197,8 +210,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
ASSERT_RTNL();
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
@@ -239,8 +251,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p);
struct tc_vlan opt = {
.index = v->tcf_index,
- .refcnt = v->tcf_refcnt - ref,
- .bindcnt = v->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&v->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&v->tcf_bindcnt) - bind,
.action = v->tcf_action,
.v_action = p->tcfv_action,
};
@@ -286,6 +298,13 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index,
return tcf_idr_search(tn, a, index);
}
+static int tcf_vlan_delete(struct net *net, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ return tcf_idr_delete_index(tn, index);
+}
+
static struct tc_action_ops act_vlan_ops = {
.kind = "vlan",
.type = TCA_ACT_VLAN,
@@ -296,6 +315,7 @@ static struct tc_action_ops act_vlan_ops = {
.cleanup = tcf_vlan_cleanup,
.walk = tcf_vlan_walker,
.lookup = tcf_vlan_search,
+ .delete = tcf_vlan_delete,
.size = sizeof(struct tcf_vlan),
};
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f74513a7c7a8..623fe2cfe529 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -277,18 +277,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block)
static int tcf_block_offload_cmd(struct tcf_block *block,
struct net_device *dev,
struct tcf_block_ext_info *ei,
- enum tc_block_command command)
+ enum tc_block_command command,
+ struct netlink_ext_ack *extack)
{
struct tc_block_offload bo = {};
bo.command = command;
bo.binder_type = ei->binder_type;
bo.block = block;
+ bo.extack = extack;
return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
}
static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
- struct tcf_block_ext_info *ei)
+ struct tcf_block_ext_info *ei,
+ struct netlink_ext_ack *extack)
{
struct net_device *dev = q->dev_queue->dev;
int err;
@@ -299,10 +302,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
/* If tc offload feature is disabled and the block we try to bind
* to already has some offloaded filters, forbid to bind.
*/
- if (!tc_can_offload(dev) && tcf_block_offload_in_use(block))
+ if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
+ NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
return -EOPNOTSUPP;
+ }
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND);
+ err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
return err;
@@ -322,7 +327,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_dec;
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND);
+ err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
return;
@@ -612,7 +617,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
if (err)
goto err_chain_head_change_cb_add;
- err = tcf_block_offload_bind(block, q, ei);
+ err = tcf_block_offload_bind(block, q, ei, extack);
if (err)
goto err_block_offload_bind;
@@ -746,18 +751,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
}
EXPORT_SYMBOL(tcf_block_cb_decref);
+static int
+tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
+ void *cb_priv, bool add, bool offload_in_use,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_chain *chain;
+ struct tcf_proto *tp;
+ int err;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ for (tp = rtnl_dereference(chain->filter_chain); tp;
+ tp = rtnl_dereference(tp->next)) {
+ if (tp->ops->reoffload) {
+ err = tp->ops->reoffload(tp, add, cb, cb_priv,
+ extack);
+ if (err && add)
+ goto err_playback_remove;
+ } else if (add && offload_in_use) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support");
+ goto err_playback_remove;
+ }
+ }
+ }
+
+ return 0;
+
+err_playback_remove:
+ tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use,
+ extack);
+ return err;
+}
+
struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv)
+ void *cb_priv,
+ struct netlink_ext_ack *extack)
{
struct tcf_block_cb *block_cb;
+ int err;
- /* At this point, playback of previous block cb calls is not supported,
- * so forbid to register to block which already has some offloaded
- * filters present.
- */
- if (tcf_block_offload_in_use(block))
- return ERR_PTR(-EOPNOTSUPP);
+ /* Replay any already present rules */
+ err = tcf_block_playback_offloads(block, cb, cb_priv, true,
+ tcf_block_offload_in_use(block),
+ extack);
+ if (err)
+ return ERR_PTR(err);
block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
if (!block_cb)
@@ -772,17 +812,22 @@ EXPORT_SYMBOL(__tcf_block_cb_register);
int tcf_block_cb_register(struct tcf_block *block,
tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv)
+ void *cb_priv, struct netlink_ext_ack *extack)
{
struct tcf_block_cb *block_cb;
- block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv);
+ block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
+ extack);
return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0;
}
EXPORT_SYMBOL(tcf_block_cb_register);
-void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb)
+void __tcf_block_cb_unregister(struct tcf_block *block,
+ struct tcf_block_cb *block_cb)
{
+ tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
+ false, tcf_block_offload_in_use(block),
+ NULL);
list_del(&block_cb->list);
kfree(block_cb);
}
@@ -796,7 +841,7 @@ void tcf_block_cb_unregister(struct tcf_block *block,
block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
if (!block_cb)
return;
- __tcf_block_cb_unregister(block_cb);
+ __tcf_block_cb_unregister(block, block_cb);
}
EXPORT_SYMBOL(tcf_block_cb_unregister);
@@ -1463,7 +1508,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
arg.w.stop = 0;
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
+ arg.w.cookie = cb->args[2];
tp->ops->walk(tp, &arg.w);
+ cb->args[2] = arg.w.cookie;
cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
return false;
@@ -1564,11 +1611,7 @@ out:
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- LIST_HEAD(actions);
-
- ASSERT_RTNL();
- tcf_exts_to_list(exts, &actions);
- tcf_action_destroy(&actions, TCA_ACT_UNBIND);
+ tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
kfree(exts->actions);
exts->nr_actions = 0;
#endif
@@ -1587,7 +1630,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tp, tb[exts->police],
rate_tlv, "police", ovr,
- TCA_ACT_BIND, extack);
+ TCA_ACT_BIND, true, extack);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -1595,17 +1638,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
exts->actions[0] = act;
exts->nr_actions = 1;
} else if (exts->action && tb[exts->action]) {
- LIST_HEAD(actions);
- int err, i = 0;
+ int err;
err = tcf_action_init(net, tp, tb[exts->action],
rate_tlv, NULL, ovr, TCA_ACT_BIND,
- &actions, &attr_size, extack);
- if (err)
+ exts->actions, &attr_size, true,
+ extack);
+ if (err < 0)
return err;
- list_for_each_entry(act, &actions, list)
- exts->actions[i++] = act;
- exts->nr_actions = i;
+ exts->nr_actions = err;
}
exts->net = net;
}
@@ -1654,14 +1695,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
- LIST_HEAD(actions);
-
nest = nla_nest_start(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
- tcf_exts_to_list(exts, &actions);
- if (tcf_action_dump(skb, &actions, 0, 0) < 0)
+ if (tcf_action_dump(skb, exts->actions, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 1aa7f6511065..66e0ac9811f9 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -43,6 +43,7 @@ struct cls_bpf_prog {
struct tcf_result res;
bool exts_integrated;
u32 gen_flags;
+ unsigned int in_hw_count;
struct tcf_exts exts;
u32 handle;
u16 bpf_num_ops;
@@ -174,6 +175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
cls_bpf_offload_cmd(tp, oldprog, prog, extack);
return err;
} else if (err > 0) {
+ prog->in_hw_count = err;
tcf_block_offload_inc(block, &prog->gen_flags);
}
}
@@ -652,6 +654,42 @@ skip:
}
}
+/* Replay every program attached to this classifier through one block callback.
+ *
+ * Programs for which tc_skip_hw() is true are ignored.  With @add set the
+ * program's filter is passed as the new program (oldprog = NULL); otherwise
+ * it is passed as the old program (prog = NULL).  A non-zero return from @cb
+ * aborts the walk only when adding a program for which tc_skip_sw() is true;
+ * in every other failure case the program is skipped and the walk goes on.
+ * After a successful callback the program's in_hw_count and gen_flags are
+ * handed to tc_cls_offload_cnt_update().
+ *
+ * Returns 0, or the error reported by @cb.
+ */
+static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+			     void *cb_priv, struct netlink_ext_ack *extack)
+{
+	struct cls_bpf_head *bpf_head = rtnl_dereference(tp->root);
+	struct tcf_block *block = tp->chain->block;
+	struct tc_cls_bpf_offload offload = {};
+	struct cls_bpf_prog *p;
+	int ret;
+
+	list_for_each_entry(p, &bpf_head->plist, link) {
+		if (tc_skip_hw(p->gen_flags))
+			continue;
+
+		tc_cls_common_offload_init(&offload.common, tp, p->gen_flags,
+					   extack);
+		offload.command = TC_CLSBPF_OFFLOAD;
+		offload.exts = &p->exts;
+		offload.name = p->bpf_name;
+		offload.exts_integrated = p->exts_integrated;
+		if (add) {
+			offload.prog = p->filter;
+			offload.oldprog = NULL;
+		} else {
+			offload.prog = NULL;
+			offload.oldprog = p->filter;
+		}
+
+		ret = cb(TC_SETUP_CLSBPF, &offload, cb_priv);
+		if (!ret) {
+			tc_cls_offload_cnt_update(block, &p->in_hw_count,
+						  &p->gen_flags, add);
+			continue;
+		}
+
+		/* Only a failed add of a skip_sw program is fatal. */
+		if (add && tc_skip_sw(p->gen_flags))
+			return ret;
+	}
+
+	return 0;
+}
+
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
.kind = "bpf",
.owner = THIS_MODULE,
@@ -662,6 +700,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
.change = cls_bpf_change,
.delete = cls_bpf_delete,
.walk = cls_bpf_walk,
+ .reoffload = cls_bpf_reoffload,
.dump = cls_bpf_dump,
.bind_class = cls_bpf_bind_class,
};
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9e8b26a80fb3..38d74803e2df 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -35,6 +35,7 @@ struct fl_flow_key {
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
struct flow_dissector_key_vlan vlan;
+ struct flow_dissector_key_vlan cvlan;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
@@ -51,6 +52,7 @@ struct fl_flow_key {
struct flow_dissector_key_mpls mpls;
struct flow_dissector_key_tcp tcp;
struct flow_dissector_key_ip ip;
+ struct flow_dissector_key_ip enc_ip;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -87,6 +89,7 @@ struct cls_fl_filter {
struct list_head list;
u32 handle;
u32 flags;
+ unsigned int in_hw_count;
struct rcu_work rwork;
struct net_device *hw_dev;
};
@@ -289,6 +292,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
fl_hw_destroy_filter(tp, f, NULL);
return err;
} else if (err > 0) {
+ f->in_hw_count = err;
tcf_block_offload_inc(block, &f->flags);
}
@@ -447,6 +451,13 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -498,22 +509,26 @@ static int fl_set_key_mpls(struct nlattr **tb,
}
static void fl_set_key_vlan(struct nlattr **tb,
+ __be16 ethertype,
+ int vlan_id_key, int vlan_prio_key,
struct flow_dissector_key_vlan *key_val,
struct flow_dissector_key_vlan *key_mask)
{
#define VLAN_PRIORITY_MASK 0x7
- if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
+ if (tb[vlan_id_key]) {
key_val->vlan_id =
- nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
+ nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK;
key_mask->vlan_id = VLAN_VID_MASK;
}
- if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
+ if (tb[vlan_prio_key]) {
key_val->vlan_priority =
- nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) &
+ nla_get_u8(tb[vlan_prio_key]) &
VLAN_PRIORITY_MASK;
key_mask->vlan_priority = VLAN_PRIORITY_MASK;
}
+ key_val->vlan_tpid = ethertype;
+ key_mask->vlan_tpid = cpu_to_be16(~0);
}
static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
@@ -551,17 +566,17 @@ static int fl_set_key_flags(struct nlattr **tb,
return 0;
}
-static void fl_set_key_ip(struct nlattr **tb,
+static void fl_set_key_ip(struct nlattr **tb, bool encap,
struct flow_dissector_key_ip *key,
struct flow_dissector_key_ip *mask)
{
- fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS,
- &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK,
- sizeof(key->tos));
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
- fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL,
- &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK,
- sizeof(key->ttl));
+ fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos));
+ fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl));
}
static int fl_set_key(struct net *net, struct nlattr **tb,
@@ -590,12 +605,28 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
- if (ethertype == htons(ETH_P_8021Q)) {
- fl_set_key_vlan(tb, &key->vlan, &mask->vlan);
- fl_set_key_val(tb, &key->basic.n_proto,
- TCA_FLOWER_KEY_VLAN_ETH_TYPE,
- &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
- sizeof(key->basic.n_proto));
+ if (eth_type_vlan(ethertype)) {
+ fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
+ TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan,
+ &mask->vlan);
+
+ if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) {
+ ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]);
+ if (eth_type_vlan(ethertype)) {
+ fl_set_key_vlan(tb, ethertype,
+ TCA_FLOWER_KEY_CVLAN_ID,
+ TCA_FLOWER_KEY_CVLAN_PRIO,
+ &key->cvlan, &mask->cvlan);
+ fl_set_key_val(tb, &key->basic.n_proto,
+ TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ &mask->basic.n_proto,
+ TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto));
+ } else {
+ key->basic.n_proto = ethertype;
+ mask->basic.n_proto = cpu_to_be16(~0);
+ }
+ }
} else {
key->basic.n_proto = ethertype;
mask->basic.n_proto = cpu_to_be16(~0);
@@ -607,7 +638,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
&mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.ip_proto));
- fl_set_key_ip(tb, &key->ip, &mask->ip);
+ fl_set_key_ip(tb, false, &key->ip, &mask->ip);
}
if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
@@ -742,6 +773,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
&mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
sizeof(key->enc_tp.dst));
+ fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
+
if (tb[TCA_FLOWER_KEY_FLAGS])
ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
@@ -821,6 +854,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask)
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_VLAN, vlan);
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_CVLAN, cvlan);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
@@ -832,6 +867,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask)
enc_control);
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
+ FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
skb_flow_dissector_init(&mask->dissector, keys, cnt);
}
@@ -1071,20 +1108,59 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f;
+
+ arg->count = arg->skip;
+
+ while ((f = idr_get_next_ul(&head->handle_idr,
+ &arg->cookie)) != NULL) {
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->cookie = f->handle + 1;
+ arg->count++;
+ }
+}
+
+static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = tp->chain->block;
struct fl_flow_mask *mask;
+ struct cls_fl_filter *f;
+ int err;
- list_for_each_entry_rcu(mask, &head->masks, list) {
- list_for_each_entry_rcu(f, &mask->filters, list) {
- if (arg->count < arg->skip)
- goto skip;
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
- break;
+ list_for_each_entry(mask, &head->masks, list) {
+ list_for_each_entry(f, &mask->filters, list) {
+ if (tc_skip_hw(f->flags))
+ continue;
+
+ tc_cls_common_offload_init(&cls_flower.common, tp,
+ f->flags, extack);
+ cls_flower.command = add ?
+ TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
+ cls_flower.cookie = (unsigned long)f;
+ cls_flower.dissector = &mask->dissector;
+ cls_flower.mask = &f->mkey;
+ cls_flower.key = &f->key;
+ cls_flower.exts = &f->exts;
+ cls_flower.classid = f->res.classid;
+
+ err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(f->flags))
+ return err;
+ continue;
}
-skip:
- arg->count++;
+
+ tc_cls_offload_cnt_update(block, &f->in_hw_count,
+ &f->flags, add);
}
}
+
+ return 0;
}
static int fl_dump_key_val(struct sk_buff *skb,
@@ -1141,20 +1217,24 @@ static int fl_dump_key_mpls(struct sk_buff *skb,
return 0;
}
-static int fl_dump_key_ip(struct sk_buff *skb,
+static int fl_dump_key_ip(struct sk_buff *skb, bool encap,
struct flow_dissector_key_ip *key,
struct flow_dissector_key_ip *mask)
{
- if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos,
- TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) ||
- fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl,
- TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl)))
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
+
+ if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) ||
+ fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)))
return -1;
return 0;
}
static int fl_dump_key_vlan(struct sk_buff *skb,
+ int vlan_id_key, int vlan_prio_key,
struct flow_dissector_key_vlan *vlan_key,
struct flow_dissector_key_vlan *vlan_mask)
{
@@ -1163,13 +1243,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb,
if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
return 0;
if (vlan_mask->vlan_id) {
- err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID,
+ err = nla_put_u16(skb, vlan_id_key,
vlan_key->vlan_id);
if (err)
return err;
}
if (vlan_mask->vlan_priority) {
- err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO,
+ err = nla_put_u8(skb, vlan_prio_key,
vlan_key->vlan_priority);
if (err)
return err;
@@ -1264,15 +1344,36 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls))
goto nla_put_failure;
- if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan))
+ if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID,
+ TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan))
+ goto nla_put_failure;
+
+ if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID,
+ TCA_FLOWER_KEY_CVLAN_PRIO,
+ &key->cvlan, &mask->cvlan) ||
+ (mask->cvlan.vlan_tpid &&
+ nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ key->cvlan.vlan_tpid)))
goto nla_put_failure;
+ if (mask->basic.n_proto) {
+ if (mask->cvlan.vlan_tpid) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ key->basic.n_proto))
+ goto nla_put_failure;
+ } else if (mask->vlan.vlan_tpid) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ key->basic.n_proto))
+ goto nla_put_failure;
+ }
+ }
+
if ((key->basic.n_proto == htons(ETH_P_IP) ||
key->basic.n_proto == htons(ETH_P_IPV6)) &&
(fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
&mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.ip_proto)) ||
- fl_dump_key_ip(skb, &key->ip, &mask->ip)))
+ fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
goto nla_put_failure;
if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
@@ -1397,7 +1498,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
&mask->enc_tp.dst,
TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
- sizeof(key->enc_tp.dst)))
+ sizeof(key->enc_tp.dst)) ||
+ fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip))
goto nla_put_failure;
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
@@ -1438,6 +1540,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.change = fl_change,
.delete = fl_delete,
.walk = fl_walk,
+ .reoffload = fl_reoffload,
.dump = fl_dump,
.bind_class = fl_bind_class,
.owner = THIS_MODULE,
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 47b207ef7762..af16f36ed578 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_head {
struct tcf_result res;
u32 handle;
u32 flags;
+ unsigned int in_hw_count;
struct rcu_work rwork;
};
@@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
mall_destroy_hw_filter(tp, head, cookie, NULL);
return err;
} else if (err > 0) {
+ head->in_hw_count = err;
tcf_block_offload_inc(block, &head->flags);
}
@@ -235,6 +237,35 @@ skip:
arg->count++;
}
+/* Replay the single matchall filter of @tp through one block callback.
+ *
+ * Does nothing when tc_skip_hw() is true for the filter.  Otherwise @cb is
+ * invoked with TC_CLSMATCHALL_REPLACE (@add) or TC_CLSMATCHALL_DESTROY
+ * (!@add), using the head pointer as cookie.  A callback error is returned
+ * only when adding a filter for which tc_skip_sw() is true; any other
+ * failure is swallowed.  On success in_hw_count and flags are handed to
+ * tc_cls_offload_cnt_update().
+ */
+static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+			  void *cb_priv, struct netlink_ext_ack *extack)
+{
+	struct cls_mall_head *head = rtnl_dereference(tp->root);
+	struct tc_cls_matchall_offload mall = {};
+	struct tcf_block *block = tp->chain->block;
+	int ret;
+
+	if (tc_skip_hw(head->flags))
+		return 0;
+
+	tc_cls_common_offload_init(&mall.common, tp, head->flags, extack);
+	if (add)
+		mall.command = TC_CLSMATCHALL_REPLACE;
+	else
+		mall.command = TC_CLSMATCHALL_DESTROY;
+	mall.cookie = (unsigned long)head;
+	mall.exts = &head->exts;
+
+	ret = cb(TC_SETUP_CLSMATCHALL, &mall, cb_priv);
+	if (!ret) {
+		tc_cls_offload_cnt_update(block, &head->in_hw_count,
+					  &head->flags, add);
+		return 0;
+	}
+
+	return (add && tc_skip_sw(head->flags)) ? ret : 0;
+}
+
static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
struct sk_buff *skb, struct tcmsg *t)
{
@@ -289,6 +320,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = {
.change = mall_change,
.delete = mall_delete,
.walk = mall_walk,
+ .reoffload = mall_reoffload,
.dump = mall_dump,
.bind_class = mall_bind_class,
.owner = THIS_MODULE,
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index fb861f90fde6..d5d2a6dc3921 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -62,6 +62,7 @@ struct tc_u_knode {
struct tc_u32_pcnt __percpu *pf;
#endif
u32 flags;
+ unsigned int in_hw_count;
#ifdef CONFIG_CLS_U32_MARK
u32 val;
u32 mask;
@@ -571,6 +572,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
u32_remove_hw_knode(tp, n, NULL);
return err;
} else if (err > 0) {
+ n->in_hw_count = err;
tcf_block_offload_inc(block, &n->flags);
}
@@ -1199,6 +1201,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
}
+/* Announce (@add) or withdraw (!@add) one hash table to a block callback.
+ *
+ * The callback receives the table's divisor, handle and prio.  Its error is
+ * propagated only when adding a table for which tc_skip_sw() is true;
+ * otherwise 0 is returned regardless of what @cb reported.
+ */
+static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+			       bool add, tc_setup_cb_t *cb, void *cb_priv,
+			       struct netlink_ext_ack *extack)
+{
+	struct tc_cls_u32_offload offload = {};
+	int ret;
+
+	tc_cls_common_offload_init(&offload.common, tp, ht->flags, extack);
+	if (add)
+		offload.command = TC_CLSU32_NEW_HNODE;
+	else
+		offload.command = TC_CLSU32_DELETE_HNODE;
+	offload.hnode.prio = ht->prio;
+	offload.hnode.handle = ht->handle;
+	offload.hnode.divisor = ht->divisor;
+
+	ret = cb(TC_SETUP_CLSU32, &offload, cb_priv);
+	if (!ret || !add)
+		return 0;
+
+	return tc_skip_sw(ht->flags) ? ret : 0;
+}
+
+/* Replay one key node through a block callback.
+ *
+ * For @add the full match description (fshift, val/mask, selector, actions
+ * and, when the node links to another table, that table's handle) is filled
+ * in and TC_CLSU32_REPLACE_KNODE is issued.  For removal only the handle is
+ * supplied with TC_CLSU32_DELETE_KNODE.  A callback error is returned only
+ * when adding a node for which tc_skip_sw() is true.  On success the node's
+ * in_hw_count and flags are handed to tc_cls_offload_cnt_update().
+ */
+static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+			       bool add, tc_setup_cb_t *cb, void *cb_priv,
+			       struct netlink_ext_ack *extack)
+{
+	struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
+	struct tcf_block *block = tp->chain->block;
+	struct tc_cls_u32_offload offload = {};
+	int ret;
+
+	tc_cls_common_offload_init(&offload.common, tp, n->flags, extack);
+	offload.knode.handle = n->handle;
+
+	if (add) {
+		offload.command = TC_CLSU32_REPLACE_KNODE;
+		offload.knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+		offload.knode.val = n->val;
+		offload.knode.mask = n->mask;
+#else
+		offload.knode.val = 0;
+		offload.knode.mask = 0;
+#endif
+		offload.knode.sel = &n->sel;
+		offload.knode.exts = &n->exts;
+		if (ht)
+			offload.knode.link_handle = ht->handle;
+	} else {
+		offload.command = TC_CLSU32_DELETE_KNODE;
+	}
+
+	ret = cb(TC_SETUP_CLSU32, &offload, cb_priv);
+	if (!ret) {
+		tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags,
+					  add);
+		return 0;
+	}
+
+	return (add && tc_skip_sw(n->flags)) ? ret : 0;
+}
+
+/* Replay all hash tables and key nodes of this priority through @cb.
+ *
+ * Only tables whose prio equals tp->prio are visited.  Ordering matters:
+ * when adding, a table is offered before the nodes it contains; when
+ * removing, the nodes go first and the table last.  Tables and nodes for
+ * which tc_skip_hw() is true are not offered.  The first error returned by
+ * a helper while adding stops the walk and is returned; the result of the
+ * table removal call is ignored.
+ */
+static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+			 void *cb_priv, struct netlink_ext_ack *extack)
+{
+	struct tc_u_common *tp_c = tp->data;
+	struct tc_u_hnode *ht;
+	struct tc_u_knode *kn;
+	unsigned int bucket;
+	int ret;
+
+	for (ht = rtnl_dereference(tp_c->hlist); ht;
+	     ht = rtnl_dereference(ht->next)) {
+		bool ht_offloadable;
+
+		if (ht->prio != tp->prio)
+			continue;
+
+		ht_offloadable = !tc_skip_hw(ht->flags);
+
+		/* Adding: the hashtable has to exist before its filters. */
+		if (add && ht_offloadable) {
+			ret = u32_reoffload_hnode(tp, ht, add, cb, cb_priv,
+						  extack);
+			if (ret)
+				return ret;
+		}
+
+		/* divisor is the highest valid bucket index, hence "<=". */
+		for (bucket = 0; bucket <= ht->divisor; bucket++) {
+			kn = rtnl_dereference(ht->ht[bucket]);
+			while (kn) {
+				if (!tc_skip_hw(kn->flags)) {
+					ret = u32_reoffload_knode(tp, kn, add,
+								  cb, cb_priv,
+								  extack);
+					if (ret)
+						return ret;
+				}
+				kn = rtnl_dereference(kn->next);
+			}
+		}
+
+		/* Removing: the hashtable goes after its filters. */
+		if (!add && ht_offloadable)
+			u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack);
+	}
+
+	return 0;
+}
+
static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
{
struct tc_u_knode *n = fh;
@@ -1336,6 +1446,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = {
.change = u32_change,
.delete = u32_delete,
.walk = u32_walk,
+ .reoffload = u32_reoffload,
.dump = u32_dump,
.bind_class = u32_bind_class,
.owner = THIS_MODULE,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 54eca685420f..98541c6399db 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
+ clockid_t clockid)
{
- hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
wd->timer.function = qdisc_watchdog;
wd->qdisc = qdisc;
}
+EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
+
+/* Initialise @wd for @qdisc using CLOCK_MONOTONIC; thin wrapper around
+ * qdisc_watchdog_init_clockid() for callers that do not pick a clock.
+ */
+void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+{
+	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
+}
EXPORT_SYMBOL(qdisc_watchdog_init);
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
new file mode 100644
index 000000000000..539c9490c308
--- /dev/null
+++ b/net/sched/sch_cake.c
@@ -0,0 +1,3019 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* COMMON Applications Kept Enhanced (CAKE) discipline
+ *
+ * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
+ * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
+ * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
+ * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
+ * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
+ * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
+ *
+ * The CAKE Principles:
+ * (or, how to have your cake and eat it too)
+ *
+ * This is a combination of several shaping, AQM and FQ techniques into one
+ * easy-to-use package:
+ *
+ * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
+ * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
+ * eliminating the need for any sort of burst parameter (eg. token bucket
+ * depth). Burst support is limited to that necessary to overcome scheduling
+ * latency.
+ *
+ * - A Diffserv-aware priority queue, giving more priority to certain classes,
+ * up to a specified fraction of bandwidth. Above that bandwidth threshold,
+ * the priority is reduced to avoid starving other tins.
+ *
+ * - Each priority tin has a separate Flow Queue system, to isolate traffic
+ * flows from each other. This prevents a burst on one flow from increasing
+ * the delay to another. Flows are distributed to queues using a
+ * set-associative hash function.
+ *
+ * - Each queue is actively managed by Cobalt, which is a combination of the
+ * Codel and Blue AQM algorithms. This serves flows fairly, and signals
+ * congestion early via ECN (if available) and/or packet drops, to keep
+ * latency low. The codel parameters are auto-tuned based on the bandwidth
+ * setting, as is necessary at low bandwidths.
+ *
+ * The configuration parameters are kept deliberately simple for ease of use.
+ * Everything has sane defaults. Complete generality of configuration is *not*
+ * a goal.
+ *
+ * The priority queue operates according to a weighted DRR scheme, combined with
+ * a bandwidth tracker which reuses the shaper logic to detect which side of the
+ * bandwidth sharing threshold the tin is operating. This determines whether a
+ * priority-based weight (high) or a bandwidth-based weight (low) is used for
+ * that tin in the current pass.
+ *
+ * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
+ * granted us permission to leverage.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/reciprocal_div.h>
+#include <net/netlink.h>
+#include <linux/version.h>
+#include <linux/if_vlan.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tcp.h>
+#include <net/flow_dissector.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#endif
+
+#define CAKE_SET_WAYS (8)
+#define CAKE_MAX_TINS (8)
+#define CAKE_QUEUES (1024)
+#define CAKE_FLOW_MASK 63
+#define CAKE_FLOW_NAT_FLAG 64
+#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */
+
+/* struct cobalt_params - contains codel and blue parameters
+ * @interval: codel initial drop rate
+ * @target: maximum persistent sojourn time & blue update rate
+ * @mtu_time: serialisation delay of maximum-size packet
+ * @p_inc: increment of blue drop probability (0.32 fxp)
+ * @p_dec: decrement of blue drop probability (0.32 fxp)
+ */
+struct cobalt_params {
+ u64 interval;
+ u64 target;
+ u64 mtu_time;
+ u32 p_inc;
+ u32 p_dec;
+};
+
+/* struct cobalt_vars - contains codel and blue variables
+ * @count: codel dropping frequency
+ * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
+ * @drop_next: time to drop next packet, or when we dropped last
+ * @blue_timer: Blue time to next drop
+ * @p_drop: BLUE drop probability (0.32 fxp)
+ * @dropping: set if in dropping state
+ * @ecn_marked: set if marked
+ */
+struct cobalt_vars {
+ u32 count;
+ u32 rec_inv_sqrt;
+ ktime_t drop_next;
+ ktime_t blue_timer;
+ u32 p_drop;
+ bool dropping;
+ bool ecn_marked;
+};
+
+enum {
+ CAKE_SET_NONE = 0,
+ CAKE_SET_SPARSE,
+ CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
+ CAKE_SET_BULK,
+ CAKE_SET_DECAYING
+};
+
+struct cake_flow {
+ /* this stuff is all needed per-flow at dequeue time */
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head flowchain;
+ s32 deficit;
+ u32 dropped;
+ struct cobalt_vars cvars;
+ u16 srchost; /* index into cake_host table */
+ u16 dsthost;
+ u8 set;
+}; /* please try to keep this structure <= 64 bytes */
+
+struct cake_host {
+ u32 srchost_tag;
+ u32 dsthost_tag;
+ u16 srchost_refcnt;
+ u16 dsthost_refcnt;
+};
+
+/* One slot of the overflow heap: two bitfields sharing a single u16.
+ * NOTE(review): the widths match CAKE_MAX_TINS (8 = 2^3) and CAKE_QUEUES
+ * (1024 = 2^10), so t and b are presumably a tin index and a queue index
+ * within that tin - confirm against the heap code.
+ */
+struct cake_heap_entry {
+	u16 t:3, b:10;
+};
+
+struct cake_tin_data {
+ struct cake_flow flows[CAKE_QUEUES];
+ u32 backlogs[CAKE_QUEUES];
+ u32 tags[CAKE_QUEUES]; /* for set association */
+ u16 overflow_idx[CAKE_QUEUES];
+ struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
+ u16 flow_quantum;
+
+ struct cobalt_params cparams;
+ u32 drop_overlimit;
+ u16 bulk_flow_count;
+ u16 sparse_flow_count;
+ u16 decaying_flow_count;
+ u16 unresponsive_flow_count;
+
+ u32 max_skblen;
+
+ struct list_head new_flows;
+ struct list_head old_flows;
+ struct list_head decaying_flows;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ ktime_t time_next_packet;
+ u64 tin_rate_ns;
+ u64 tin_rate_bps;
+ u16 tin_rate_shft;
+
+ u16 tin_quantum_prio;
+ u16 tin_quantum_band;
+ s32 tin_deficit;
+ u32 tin_backlog;
+ u32 tin_dropped;
+ u32 tin_ecn_mark;
+
+ u32 packets;
+ u64 bytes;
+
+ u32 ack_drops;
+
+ /* moving averages */
+ u64 avge_delay;
+ u64 peak_delay;
+ u64 base_delay;
+
+ /* hash function stats */
+ u32 way_directs;
+ u32 way_hits;
+ u32 way_misses;
+ u32 way_collisions;
+}; /* number of tins is small, so size of this struct doesn't matter much */
+
+struct cake_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct cake_tin_data *tins;
+
+ struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
+ u16 overflow_timeout;
+
+ u16 tin_cnt;
+ u8 tin_mode;
+ u8 flow_mode;
+ u8 ack_filter;
+ u8 atm_mode;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ u16 rate_shft;
+ ktime_t time_next_packet;
+ ktime_t failsafe_next_packet;
+ u64 rate_ns;
+ u64 rate_bps;
+ u16 rate_flags;
+ s16 rate_overhead;
+ u16 rate_mpu;
+ u64 interval;
+ u64 target;
+
+ /* resource tracking */
+ u32 buffer_used;
+ u32 buffer_max_used;
+ u32 buffer_limit;
+ u32 buffer_config_limit;
+
+ /* indices for dequeue */
+ u16 cur_tin;
+ u16 cur_flow;
+
+ struct qdisc_watchdog watchdog;
+ const u8 *tin_index;
+ const u8 *tin_order;
+
+ /* bandwidth capacity estimate */
+ ktime_t last_packet_time;
+ ktime_t avg_window_begin;
+ u64 avg_packet_interval;
+ u64 avg_window_bytes;
+ u64 avg_peak_bandwidth;
+ ktime_t last_reconfig_time;
+
+ /* packet length stats */
+ u32 avg_netoff;
+ u16 max_netlen;
+ u16 max_adjlen;
+ u16 min_netlen;
+ u16 min_adjlen;
+};
+
+enum {
+ CAKE_FLAG_OVERHEAD = BIT(0),
+ CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
+ CAKE_FLAG_INGRESS = BIT(2),
+ CAKE_FLAG_WASH = BIT(3),
+ CAKE_FLAG_SPLIT_GSO = BIT(4)
+};
+
+/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
+ * obtain the best features of each. Codel is excellent on flows which
+ * respond to congestion signals in a TCP-like way. BLUE is more effective on
+ * unresponsive flows.
+ */
+
+/* Per-packet state; get_cobalt_cb() overlays it on the data area of the
+ * skb's qdisc control block.
+ */
+struct cobalt_skb_cb {
+	ktime_t enqueue_time;	/* accessed via cobalt_{get,set}_enqueue_time() */
+	u32 adjusted_len;	/* NOTE(review): presumably the overhead-adjusted packet length - confirm */
+};
+
+/* Convert a duration from microseconds to nanoseconds (no overflow check). */
+static u64 us_to_ns(u64 us)
+{
+	return us * NSEC_PER_USEC;
+}
+
+/* Return the Cobalt private area of @skb's qdisc control block, after
+ * validating that struct cobalt_skb_cb fits into it.
+ */
+static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
+{
+	struct qdisc_skb_cb *qcb;
+
+	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
+	qcb = qdisc_skb_cb(skb);
+
+	return (struct cobalt_skb_cb *)qcb->data;
+}
+
+/* Read back the timestamp stored by cobalt_set_enqueue_time(). */
+static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
+{
+	const struct cobalt_skb_cb *cb = get_cobalt_cb(skb);
+
+	return cb->enqueue_time;
+}
+
+/* Record @now as the enqueue timestamp in @skb's Cobalt control block. */
+static void cobalt_set_enqueue_time(struct sk_buff *skb,
+				    ktime_t now)
+{
+	struct cobalt_skb_cb *cb = get_cobalt_cb(skb);
+
+	cb->enqueue_time = now;
+}
+
+static u16 quantum_div[CAKE_QUEUES + 1] = {0};
+
+/* Diffserv lookup tables */
+
+static const u8 precedence[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+static const u8 diffserv8[] = {
+ 2, 5, 1, 2, 4, 2, 2, 2,
+ 0, 2, 1, 2, 1, 2, 1, 2,
+ 5, 2, 4, 2, 4, 2, 4, 2,
+ 3, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 2, 2, 6, 2, 6, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const u8 diffserv4[] = {
+ 0, 2, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 0, 0, 3, 0, 3, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 diffserv3[] = {
+ 0, 0, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 0, 2, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 besteffort[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* tin priority order for stats dumping */
+
+static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
+static const u8 bulk_order[] = {1, 0, 2, 3};
+
+/* Cached rec_inv_sqrt values, indexed by count, for every count below
+ * REC_INV_SQRT_CACHE.  Entry 0 doubles as the "already filled" marker: it
+ * stays zero until cobalt_cache_init() has run (checked in
+ * cobalt_vars_init()).
+ */
+#define REC_INV_SQRT_CACHE (16)
+static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
+
+/* One Newton-Raphson refinement of vars->rec_inv_sqrt for vars->count.
+ *
+ * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * invsqrt is a fixed point number (< 1.0) with a 32bit mantissa, aka Q0.32.
+ * The result is written back to vars->rec_inv_sqrt.
+ */
+
+static void cobalt_newton_step(struct cobalt_vars *vars)
+{
+	const u32 x = vars->rec_inv_sqrt;
+	const u32 x_sq = ((u64)x * x) >> 32;
+	u64 acc = (3LL << 32) - ((u64)vars->count * x_sq);
+
+	/* Drop two bits first so the multiply below cannot overflow; the
+	 * final shift (32 - 2 + 1) accounts for them and for the "/ 2".
+	 */
+	acc >>= 2;
+	acc = (acc * x) >> (32 - 2 + 1);
+
+	vars->rec_inv_sqrt = acc;
+}
+
+/* Refresh vars->rec_inv_sqrt after vars->count changed: small counts are
+ * served from the precomputed cache, larger ones get a single Newton step
+ * applied to the current estimate.
+ */
+static void cobalt_invsqrt(struct cobalt_vars *vars)
+{
+	if (vars->count >= REC_INV_SQRT_CACHE) {
+		cobalt_newton_step(vars);
+		return;
+	}
+
+	vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
+}
+
+/* Fill cobalt_rec_inv_sqrt_cache[].
+ *
+ * For small count values a single Newton step is a poor approximation and
+ * the resulting timing differs a lot from the accurate value, particularly
+ * when stepping from count 1 to 2 or vice versa; the error when stepping up
+ * to count 2 is large enough to give the value that *should* have been
+ * produced at count 4. Above 16, a single Newton step gives sufficient
+ * accuracy in either direction, given the precision stored.
+ *
+ * Hence the cache: entry 0 is ~0U, and every further entry is obtained by
+ * running four Newton steps starting from the previous entry's value.
+ */
+
+static void cobalt_cache_init(void)
+{
+	struct cobalt_vars v;
+	int step;
+
+	memset(&v, 0, sizeof(v));
+	v.rec_inv_sqrt = ~0U;
+	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
+
+	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
+		for (step = 0; step < 4; step++)
+			cobalt_newton_step(&v);
+
+		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
+	}
+}
+
+/* Zero a cobalt_vars instance. On the first call (detected by cache entry 0
+ * still being zero) the shared reciprocal-sqrt cache is built as well.
+ */
+static void cobalt_vars_init(struct cobalt_vars *vars)
+{
+	memset(vars, 0, sizeof(*vars));
+
+	if (cobalt_rec_inv_sqrt_cache[0])
+		return;
+
+	cobalt_cache_init();
+	cobalt_rec_inv_sqrt_cache[0] = ~0;
+}
+
+/* CoDel control law: next signalling time is t + interval/sqrt(count).
+ * rec_inv_sqrt holds the reciprocal of sqrt(count), so neither sqrt() nor a
+ * division is needed here; the interval is scaled with reciprocal_scale()
+ * and added to t.
+ */
+static ktime_t cobalt_control(ktime_t t,
+			      u64 interval,
+			      u32 rec_inv_sqrt)
+{
+	u32 delay = reciprocal_scale(interval, rec_inv_sqrt);
+
+	return ktime_add_ns(t, delay);
+}
+
+/* To be called when a packet had to be dropped because the queue overflowed.
+ *
+ * Forces the Codel side into dropping state with drop_next = now and a
+ * count of at least 1. If more than p->target has elapsed since the last
+ * BLUE update, p_drop is raised by p->p_inc (saturating at ~0) and the BLUE
+ * timer is restarted.
+ *
+ * Returns true if the BLUE state was quiescent (p_drop == 0) before but
+ * active after this call.
+ */
+static bool cobalt_queue_full(struct cobalt_vars *vars,
+			      struct cobalt_params *p,
+			      ktime_t now)
+{
+	bool became_active = false;
+
+	vars->dropping = true;
+	vars->drop_next = now;
+	if (!vars->count)
+		vars->count = 1;
+
+	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) <= p->target)
+		return became_active;
+
+	became_active = !vars->p_drop;
+	vars->p_drop += p->p_inc;
+	/* wrapped around: clamp to the maximum */
+	if (vars->p_drop < p->p_inc)
+		vars->p_drop = ~0;
+	vars->blue_timer = now;
+
+	return became_active;
+}
+
+/* To be called when the queue was serviced but turned out to be empty.
+ *
+ * Leaves dropping state. If BLUE is active and more than p->target has
+ * elapsed since its last update, p_drop is lowered by p->p_dec (floored at
+ * 0) and the BLUE timer is restarted. If a signalling event is due, count
+ * is decremented and drop_next is advanced accordingly.
+ *
+ * Returns true if the BLUE state was active before but quiescent
+ * (p_drop == 0) after this call.
+ */
+static bool cobalt_queue_empty(struct cobalt_vars *vars,
+			       struct cobalt_params *p,
+			       ktime_t now)
+{
+	bool went_quiet = false;
+
+	vars->dropping = false;
+
+	if (vars->p_drop &&
+	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
+		vars->p_drop = vars->p_drop < p->p_dec ?
+			       0 : vars->p_drop - p->p_dec;
+		vars->blue_timer = now;
+		went_quiet = !vars->p_drop;
+	}
+
+	if (!vars->count || ktime_to_ns(ktime_sub(now, vars->drop_next)) < 0)
+		return went_quiet;
+
+	vars->count--;
+	cobalt_invsqrt(vars);
+	vars->drop_next = cobalt_control(vars->drop_next,
+					 p->interval,
+					 vars->rec_inv_sqrt);
+
+	return went_quiet;
+}
+
+/* Call this with a freshly dequeued packet for possible congestion marking.
+ * Returns true as an instruction to drop the packet, false for delivery.
+ *
+ * @vars:       per-flow state; dropping, count, drop_next, rec_inv_sqrt and
+ *              ecn_marked are updated here
+ * @p:          parameters read here: target, interval, mtu_time
+ * @now:        current time
+ * @skb:        the dequeued packet; it may get CE-marked via
+ *              INET_ECN_set_ce() instead of being dropped
+ * @bulk_flows: scales one of the sojourn-time thresholds
+ *              (mtu_time * bulk_flows * 2)
+ */
+static bool cobalt_should_drop(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now,
+ struct sk_buff *skb,
+ u32 bulk_flows)
+{
+ bool next_due, over_target, drop = false;
+ ktime_t schedule;
+ u64 sojourn;
+
+/* The 'schedule' variable records, in its sign, whether 'now' is before or
+ * after 'drop_next'. This allows 'drop_next' to be updated before the next
+ * scheduling decision is actually branched, without destroying that
+ * information. Similarly, the first 'schedule' value calculated is preserved
+ * in the boolean 'next_due'.
+ *
+ * As for 'drop_next', we take advantage of the fact that 'interval' is both
+ * the delay between first exceeding 'target' and the first signalling event,
+ * *and* the scaling factor for the signalling frequency. It's therefore very
+ * natural to use a single mechanism for both purposes, and eliminates a
+ * significant amount of reference Codel's spaghetti code. To help with this,
+ * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
+ * as possible to 1.0 in fixed-point.
+ */
+
+ sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+ schedule = ktime_sub(now, vars->drop_next);
+ /* the packet counts as over target only if all three thresholds are
+ * exceeded
+ */
+ over_target = sojourn > p->target &&
+ sojourn > p->mtu_time * bulk_flows * 2 &&
+ sojourn > p->mtu_time * 4;
+ next_due = vars->count && ktime_to_ns(schedule) >= 0;
+
+ vars->ecn_marked = false;
+
+ if (over_target) {
+ /* entering dropping state: schedule the first signal */
+ if (!vars->dropping) {
+ vars->dropping = true;
+ vars->drop_next = cobalt_control(now,
+ p->interval,
+ vars->rec_inv_sqrt);
+ }
+ if (!vars->count)
+ vars->count = 1;
+ } else if (vars->dropping) {
+ vars->dropping = false;
+ }
+
+ if (next_due && vars->dropping) {
+ /* Use ECN mark if possible, otherwise drop */
+ drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+
+ /* increment count, saturating instead of wrapping to 0 */
+ vars->count++;
+ if (!vars->count)
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = ktime_sub(now, vars->drop_next);
+ } else {
+ /* not signalling: wind count back down, one step per
+ * signalling event that has already come due
+ */
+ while (next_due) {
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = ktime_sub(now, vars->drop_next);
+ next_due = vars->count && ktime_to_ns(schedule) >= 0;
+ }
+ }
+
+ /* Simple BLUE implementation. Lack of ECN is deliberate. */
+ if (vars->p_drop)
+ drop |= (prandom_u32() < vars->p_drop);
+
+ /* Overload the drop_next field as an activity timeout */
+ if (!vars->count)
+ vars->drop_next = ktime_add_ns(now, p->interval);
+ else if (ktime_to_ns(schedule) > 0 && !drop)
+ vars->drop_next = now;
+
+ return drop;
+}
+
+/* Overwrite the IPv4 addresses (and, if the dissector found ports, the
+ * ports) in @keys with the values from the packet's conntrack tuple.
+ *
+ * Only IPv4 packets are handled and nothing happens if no tuple can be
+ * obtained. When the skb carries no conntrack reference (_nfct is zero) the
+ * tuple's source and destination are used swapped. The function compiles to
+ * a no-op without CONFIG_NF_CONNTRACK.
+ */
+static void cake_update_flowkeys(struct flow_keys *keys,
+				 const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	struct nf_conntrack_tuple tuple = {};
+	bool rev = !skb->_nfct;
+
+	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
+		return;
+
+	if (!nf_ct_get_tuple_skb(&tuple, skb))
+		return;
+
+	if (rev) {
+		keys->addrs.v4addrs.src = tuple.dst.u3.ip;
+		keys->addrs.v4addrs.dst = tuple.src.u3.ip;
+	} else {
+		keys->addrs.v4addrs.src = tuple.src.u3.ip;
+		keys->addrs.v4addrs.dst = tuple.dst.u3.ip;
+	}
+
+	if (!keys->ports.ports)
+		return;
+
+	if (rev) {
+		keys->ports.src = tuple.dst.u.all;
+		keys->ports.dst = tuple.src.u.all;
+	} else {
+		keys->ports.src = tuple.src.u.all;
+		keys->ports.dst = tuple.dst.u.all;
+	}
+#endif
+}
+
+/* Cake has several subtle multiple bit settings: CAKE_FLOW_DUAL_SRC is a
+ * multi-bit pattern, so all of its bits must be present in flow_mode for a
+ * match. Triple isolate mode therefore matches as well.
+ */
+
+static bool cake_dsrc(int flow_mode)
+{
+	const int mask = CAKE_FLOW_DUAL_SRC;
+
+	return (flow_mode & mask) == mask;
+}
+
+/* True when every bit of CAKE_FLOW_DUAL_DST is set in flow_mode. */
+static bool cake_ddst(int flow_mode)
+{
+	const int mask = CAKE_FLOW_DUAL_DST;
+
+	return (flow_mode & mask) == mask;
+}
+
+/* Map a packet to a flow queue index within tin @q.
+ *
+ * Returns 0 immediately for CAKE_FLOW_NONE. Otherwise the packet is
+ * dissected, a flow hash is built according to @flow_mode and reduced
+ * modulo CAKE_QUEUES, and a set-associative lookup over CAKE_SET_WAYS
+ * neighbouring slots picks the queue. The return value is that queue index.
+ *
+ * Side effects on @q: the way_* statistics counters, tags[], and - when a
+ * queue is newly assigned in a dual-host mode - the hosts[] tags/refcounts
+ * and the flow's srchost/dsthost indices.
+ *
+ * NOTE(review): the set arithmetic (outer_hash + k) presumably relies on
+ * CAKE_QUEUES being a multiple of CAKE_SET_WAYS - confirm.
+ */
+static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
+ int flow_mode)
+{
+ u32 flow_hash = 0, srchost_hash, dsthost_hash;
+ u16 reduced_hash, srchost_idx, dsthost_idx;
+ struct flow_keys keys, host_keys;
+
+ if (unlikely(flow_mode == CAKE_FLOW_NONE))
+ return 0;
+
+ skb_flow_dissect_flow_keys(skb, &keys,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ if (flow_mode & CAKE_FLOW_NAT_FLAG)
+ cake_update_flowkeys(&keys, skb);
+
+ /* flow_hash_from_keys() sorts the addresses by value, so we have
+ * to preserve their order in a separate data structure to treat
+ * src and dst host addresses as independently selectable.
+ */
+ host_keys = keys;
+ host_keys.ports.ports = 0;
+ host_keys.basic.ip_proto = 0;
+ host_keys.keyid.keyid = 0;
+ host_keys.tags.flow_label = 0;
+
+ /* hash each host address on its own by blanking the other one */
+ switch (host_keys.control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ host_keys.addrs.v4addrs.src = 0;
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ host_keys.addrs.v4addrs.dst = 0;
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ memset(&host_keys.addrs.v6addrs.src, 0,
+ sizeof(host_keys.addrs.v6addrs.src));
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ memset(&host_keys.addrs.v6addrs.dst, 0,
+ sizeof(host_keys.addrs.v6addrs.dst));
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ default:
+ dsthost_hash = 0;
+ srchost_hash = 0;
+ }
+
+ /* This *must* be after the above switch, since as a
+ * side-effect it sorts the src and dst addresses.
+ */
+ if (flow_mode & CAKE_FLOW_FLOWS)
+ flow_hash = flow_hash_from_keys(&keys);
+
+ /* host-only modes: the flow hash is built from the host hashes */
+ if (!(flow_mode & CAKE_FLOW_FLOWS)) {
+ if (flow_mode & CAKE_FLOW_SRC_IP)
+ flow_hash ^= srchost_hash;
+
+ if (flow_mode & CAKE_FLOW_DST_IP)
+ flow_hash ^= dsthost_hash;
+ }
+
+ reduced_hash = flow_hash % CAKE_QUEUES;
+
+ /* set-associative hashing */
+ /* fast path if no hash collision (direct lookup succeeds) */
+ if (likely(q->tags[reduced_hash] == flow_hash &&
+ q->flows[reduced_hash].set)) {
+ q->way_directs++;
+ } else {
+ u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
+ u32 outer_hash = reduced_hash - inner_hash;
+ bool allocate_src = false;
+ bool allocate_dst = false;
+ u32 i, k;
+
+ /* check if any active queue in the set is reserved for
+ * this flow.
+ */
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->tags[outer_hash + k] == flow_hash) {
+ if (i)
+ q->way_hits++;
+
+ if (!q->flows[outer_hash + k].set) {
+ /* need to increment host refcnts */
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ }
+
+ goto found;
+ }
+ }
+
+ /* no queue is reserved for this flow, look for an
+ * empty one. k is not reset: after a full pass of the loop
+ * above it is back at inner_hash.
+ */
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->flows[outer_hash + k].set) {
+ q->way_misses++;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ goto found;
+ }
+ }
+
+ /* With no empty queues, default to the original
+ * queue, accept the collision, update the host tags.
+ * NOTE(review): the refcounts below are decremented regardless
+ * of flow_mode - confirm this is intended outside dual modes.
+ */
+ q->way_collisions++;
+ q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
+ q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+found:
+ /* reserve queue for future packets in same flow */
+ reduced_hash = outer_hash + k;
+ q->tags[reduced_hash] = flow_hash;
+
+ /* same set-associative scheme for the source host entry: first
+ * look for a matching tag, then for an entry with zero
+ * refcount; if neither exists the entry at the starting
+ * position is re-tagged.
+ */
+ if (allocate_src) {
+ srchost_idx = srchost_hash % CAKE_QUEUES;
+ inner_hash = srchost_idx % CAKE_SET_WAYS;
+ outer_hash = srchost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].srchost_tag ==
+ srchost_hash)
+ goto found_src;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].srchost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].srchost_tag = srchost_hash;
+found_src:
+ srchost_idx = outer_hash + k;
+ q->hosts[srchost_idx].srchost_refcnt++;
+ q->flows[reduced_hash].srchost = srchost_idx;
+ }
+
+ /* and likewise for the destination host entry */
+ if (allocate_dst) {
+ dsthost_idx = dsthost_hash % CAKE_QUEUES;
+ inner_hash = dsthost_idx % CAKE_SET_WAYS;
+ outer_hash = dsthost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].dsthost_tag ==
+ dsthost_hash)
+ goto found_dst;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].dsthost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
+found_dst:
+ dsthost_idx = outer_hash + k;
+ q->hosts[dsthost_idx].dsthost_refcnt++;
+ q->flows[reduced_hash].dsthost = dsthost_idx;
+ }
+ }
+
+ return reduced_hash;
+}
+
+/* Flow queue helpers; these might change when/if skbs use a standard
+ * list_head.
+ */
+
+/* Unlink and return the skb at the head of @flow's queue, or NULL if the
+ * queue is empty. The returned skb has its next pointer cleared.
+ */
+static struct sk_buff *dequeue_head(struct cake_flow *flow)
+{
+	struct sk_buff *first = flow->head;
+
+	if (!first)
+		return NULL;
+
+	flow->head = first->next;
+	first->next = NULL;
+
+	return first;
+}
+
+/* Append @skb at the tail of @flow's queue; an empty queue (head == NULL)
+ * gets @skb as its head as well.
+ */
+
+static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
+{
+	if (flow->head)
+		flow->tail->next = skb;
+	else
+		flow->head = skb;
+
+	flow->tail = skb;
+	skb->next = NULL;
+}
+
+/* Return a pointer to the packet's IP header, using @buf as bounce buffer
+ * when the header is not directly accessible.
+ *
+ * For IPv4 carrying IPv6 (6in4) the inner IPv6 header is returned; for
+ * plain IPv4 the IPv4 header; for IPv6 the full IPv6 header. The result is
+ * typed as struct iphdr * in all cases, so callers must check ->version.
+ * Returns NULL if the header cannot be read or the version is neither 4
+ * nor 6.
+ */
+static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
+				    struct ipv6hdr *buf)
+{
+	unsigned int offset = skb_network_offset(skb);
+	struct iphdr *iph;
+
+	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
+	if (!iph)
+		return NULL;
+
+	switch (iph->version) {
+	case 4:
+		if (iph->protocol != IPPROTO_IPV6)
+			return iph;
+
+		return skb_header_pointer(skb, offset + iph->ihl * 4,
+					  sizeof(struct ipv6hdr), buf);
+	case 6:
+		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
+					  buf);
+	default:
+		return NULL;
+	}
+}
+
+/* Locate the TCP header of @skb and return a pointer to the header
+ * including its options, limited to @bufsize bytes; @buf is used as bounce
+ * buffer when the data is not directly accessible.
+ *
+ * Handles TCP over IPv4, TCP over IPv6 (without extension headers) and TCP
+ * over IPv6-in-IPv4. Returns NULL for anything else, when a header cannot
+ * be read, or when the TCP data offset is below the minimum of 5 words.
+ */
+static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
+				      void *buf, unsigned int bufsize)
+{
+	unsigned int offset = skb_network_offset(skb);
+	const struct ipv6hdr *ipv6h;
+	const struct tcphdr *tcph;
+	const struct iphdr *iph;
+	struct ipv6hdr _ipv6h;
+	struct tcphdr _tcph;
+
+	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
+
+	if (!ipv6h)
+		return NULL;
+
+	if (ipv6h->version == 4) {
+		iph = (struct iphdr *)ipv6h;
+		offset += iph->ihl * 4;
+
+		/* special-case 6in4 tunnelling, as that is a common way to get
+		 * v6 connectivity in the home
+		 */
+		if (iph->protocol == IPPROTO_IPV6) {
+			ipv6h = skb_header_pointer(skb, offset,
+						   sizeof(_ipv6h), &_ipv6h);
+
+			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
+				return NULL;
+
+			offset += sizeof(struct ipv6hdr);
+
+		} else if (iph->protocol != IPPROTO_TCP) {
+			return NULL;
+		}
+
+	} else if (ipv6h->version == 6) {
+		if (ipv6h->nexthdr != IPPROTO_TCP)
+			return NULL;
+
+		offset += sizeof(struct ipv6hdr);
+	} else {
+		return NULL;
+	}
+
+	/* A data offset below 5 words would make the fetch below shorter
+	 * than struct tcphdr, leaving callers to read header fields that
+	 * were never copied; reject such packets.
+	 */
+	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+	if (!tcph || tcph->doff < 5)
+		return NULL;
+
+	return skb_header_pointer(skb, offset,
+				  min(__tcp_hdrlen(tcph), bufsize), buf);
+}
+
+/* Search the options of @tcph for option kind @code.
+ *
+ * On success returns a pointer to the option's payload (just past the kind
+ * and length bytes) and stores the option's total length, including those
+ * two bytes, in *@oplen. Returns NULL, leaving *@oplen untouched, if the
+ * option is absent or the option list is malformed.
+ */
+static const void *cake_get_tcpopt(const struct tcphdr *tcph,
+				   int code, int *oplen)
+{
+	/* inspired by tcp_parse_options in tcp_input.c */
+	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
+	const u8 *ptr = (const u8 *)(tcph + 1);
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		if (opcode == TCPOPT_EOL)
+			break;
+		if (opcode == TCPOPT_NOP) {
+			length--;
+			continue;
+		}
+		/* the length byte must lie inside the options area too */
+		if (length < 2)
+			break;
+		opsize = *ptr++;
+		if (opsize < 2 || opsize > length)
+			break;
+
+		if (opcode == code) {
+			*oplen = opsize;
+			return ptr;
+		}
+
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+
+	return NULL;
+}
+
+/* Compare the SACK options of two TCP headers. A sequence is considered
+ * greater if it SACKs more bytes than the other. In the case where both
+ * sequences ACK bytes that the other doesn't, A is considered greater.
+ * DSACKs in A also make A be considered greater.
+ *
+ * Note the sign convention, which is the reverse of strcmp()-style
+ * comparators:
+ *
+ * @return -1 if A is considered greater, 1 if B is greater, 0 if neither
+ */
+static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
+				  const struct tcphdr *tcph_b)
+{
+	const struct tcp_sack_block_wire *sack_a, *sack_b;
+	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
+	u32 bytes_a = 0, bytes_b = 0;
+	/* cake_get_tcpopt() leaves the length untouched when the option is
+	 * absent, so start from a defined value before adjusting it below.
+	 */
+	int oplen_a = 0, oplen_b = 0;
+	bool first = true;
+
+	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
+	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
+
+	/* pointers point to option contents */
+	oplen_a -= TCPOLEN_SACK_BASE;
+	oplen_b -= TCPOLEN_SACK_BASE;
+
+	/* The pointer checks come first in every clause, so the lengths are
+	 * only looked at for options that were actually found.
+	 */
+	if (sack_a && oplen_a >= sizeof(*sack_a) &&
+	    (!sack_b || oplen_b < sizeof(*sack_b)))
+		return -1;
+	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
+		 (!sack_a || oplen_a < sizeof(*sack_a)))
+		return 1;
+	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
+		 (!sack_b || oplen_b < sizeof(*sack_b)))
+		return 0;
+
+	/* every block SACKed by A must be covered by some block of B */
+	while (oplen_a >= sizeof(*sack_a)) {
+		const struct tcp_sack_block_wire *sack_tmp = sack_b;
+		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
+		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
+		int oplen_tmp = oplen_b;
+		bool found = false;
+
+		/* DSACK; always considered greater to prevent dropping */
+		if (before(start_a, ack_seq_a))
+			return -1;
+
+		bytes_a += end_a - start_a;
+
+		while (oplen_tmp >= sizeof(*sack_tmp)) {
+			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
+			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
+
+			/* first time through we count the total size */
+			if (first)
+				bytes_b += end_b - start_b;
+
+			if (!after(start_b, start_a) && !before(end_b, end_a)) {
+				found = true;
+				if (!first)
+					break;
+			}
+			oplen_tmp -= sizeof(*sack_tmp);
+			sack_tmp++;
+		}
+
+		if (!found)
+			return -1;
+
+		oplen_a -= sizeof(*sack_a);
+		sack_a++;
+		first = false;
+	}
+
+	/* If we made it this far, all ranges SACKed by A are covered by B, so
+	 * either the SACKs are equal, or B SACKs more bytes.
+	 */
+	return bytes_b > bytes_a ? 1 : 0;
+}
+
+/* Extract the TCP timestamp option from @tcph into *@tsval and *@tsecr.
+ * Both outputs are left untouched if the option is missing or does not have
+ * the standard length.
+ */
+static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
+				 u32 *tsval, u32 *tsecr)
+{
+	const u8 *opt;
+	int optlen;
+
+	opt = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &optlen);
+	if (!opt || optlen != TCPOLEN_TIMESTAMP)
+		return;
+
+	*tsval = get_unaligned_be32(opt);
+	*tsecr = get_unaligned_be32(opt + 4);
+}
+
+/* Decide from flags and options alone whether the ACK described by @tcph is
+ * a candidate for being dropped in favour of a newer ACK whose timestamp
+ * option carried @tstamp_new / @tsecr_new.
+ *
+ * Returns false if any flag other than ACK (ECE/CWR excepted) is set, if a
+ * SACK or timestamp option has an unexpected length, if either timestamp
+ * value is after the new one, or if any other option except MD5SIG is
+ * present. Returns true otherwise; option parsing simply stops at EOL or at
+ * a malformed length.
+ */
+static bool cake_tcph_may_drop(const struct tcphdr *tcph,
+			       u32 tstamp_new, u32 tsecr_new)
+{
+	/* inspired by tcp_parse_options in tcp_input.c */
+	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
+	const u8 *ptr = (const u8 *)(tcph + 1);
+	u32 tstamp, tsecr;
+
+	/* 3 reserved flags must be unset to avoid future breakage
+	 * ACK must be set
+	 * ECE/CWR are handled separately
+	 * All other flags URG/PSH/RST/SYN/FIN must be unset
+	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
+	 * 0x00C00000 = CWR/ECE (handled separately)
+	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
+	 */
+	if (((tcp_flag_word(tcph) &
+	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
+		return false;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		if (opcode == TCPOPT_EOL)
+			break;
+		if (opcode == TCPOPT_NOP) {
+			length--;
+			continue;
+		}
+		/* the length byte must lie inside the options area too */
+		if (length < 2)
+			break;
+		opsize = *ptr++;
+		if (opsize < 2 || opsize > length)
+			break;
+
+		switch (opcode) {
+		case TCPOPT_MD5SIG: /* doesn't influence state */
+			break;
+
+		case TCPOPT_SACK: /* stricter checking performed later */
+			if (opsize % 8 != 2)
+				return false;
+			break;
+
+		case TCPOPT_TIMESTAMP:
+			/* only drop timestamps lower than new */
+			if (opsize != TCPOLEN_TIMESTAMP)
+				return false;
+			tstamp = get_unaligned_be32(ptr);
+			tsecr = get_unaligned_be32(ptr + 4);
+			if (after(tstamp, tstamp_new) ||
+			    after(tsecr, tsecr_new))
+				return false;
+			break;
+
+		case TCPOPT_MSS:  /* these should only be set on SYN */
+		case TCPOPT_WINDOW:
+		case TCPOPT_SACK_PERM:
+		case TCPOPT_FASTOPEN:
+		case TCPOPT_EXP:
+		default: /* don't drop if any unknown options are present */
+			return false;
+		}
+
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+
+	return true;
+}
+
+/* ACK filter: the packet at the tail of @flow is treated as the
+ * 'triggering' ACK, and the packets queued ahead of it are searched for an
+ * older pure ACK of the same connection that it makes redundant.
+ *
+ * Returns the redundant skb, already unlinked from the flow's list (its
+ * next pointer cleared), or NULL if nothing may be removed. This function
+ * only unlinks the skb; it does not free it or adjust any counters.
+ */
+static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
+ struct cake_flow *flow)
+{
+ bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
+ struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
+ struct sk_buff *skb_check, *skb_prev = NULL;
+ const struct ipv6hdr *ipv6h, *ipv6h_check;
+ unsigned char _tcph[64], _tcph_check[64];
+ const struct tcphdr *tcph, *tcph_check;
+ const struct iphdr *iph, *iph_check;
+ struct ipv6hdr _iph, _iph_check;
+ const struct sk_buff *skb;
+ int seglen, num_found = 0;
+ u32 tstamp = 0, tsecr = 0;
+ __be32 elig_flags = 0;
+ int sack_comp;
+
+ /* no other possible ACKs to filter */
+ if (flow->head == flow->tail)
+ return NULL;
+
+ skb = flow->tail;
+ tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
+ iph = cake_get_iphdr(skb, &_iph);
+ if (!tcph)
+ return NULL;
+
+ /* tstamp/tsecr stay 0 if the triggering packet has no timestamp */
+ cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
+
+ /* the 'triggering' packet need only have the ACK flag set.
+ * also check that SYN is not set, as there won't be any previous ACKs.
+ */
+ if ((tcp_flag_word(tcph) &
+ (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
+ return NULL;
+
+ /* the 'triggering' ACK is at the tail of the queue, we have already
+ * returned if it is the only packet in the flow. loop through the rest
+ * of the queue looking for pure ACKs with the same 5-tuple as the
+ * triggering one.
+ */
+ for (skb_check = flow->head;
+ skb_check && skb_check != skb;
+ skb_prev = skb_check, skb_check = skb_check->next) {
+ iph_check = cake_get_iphdr(skb_check, &_iph_check);
+ tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
+ sizeof(_tcph_check));
+
+ /* only TCP packets with matching 5-tuple are eligible, and only
+ * drop safe headers
+ */
+ if (!tcph_check || iph->version != iph_check->version ||
+ tcph_check->source != tcph->source ||
+ tcph_check->dest != tcph->dest)
+ continue;
+
+ /* compare addresses and work out the length following the IP
+ * header
+ */
+ if (iph_check->version == 4) {
+ if (iph_check->saddr != iph->saddr ||
+ iph_check->daddr != iph->daddr)
+ continue;
+
+ seglen = ntohs(iph_check->tot_len) -
+ (4 * iph_check->ihl);
+ } else if (iph_check->version == 6) {
+ ipv6h = (struct ipv6hdr *)iph;
+ ipv6h_check = (struct ipv6hdr *)iph_check;
+
+ if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
+ ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
+ continue;
+
+ seglen = ntohs(ipv6h_check->payload_len);
+ } else {
+ WARN_ON(1); /* shouldn't happen */
+ continue;
+ }
+
+ /* If the ECE/CWR flags changed from the previous eligible
+ * packet in the same flow, we should no longer be dropping that
+ * previous packet as this would lose information.
+ */
+ if (elig_ack && (tcp_flag_word(tcph_check) &
+ (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
+ elig_ack = NULL;
+ elig_ack_prev = NULL;
+ num_found--;
+ }
+
+ /* Check TCP options and flags, don't drop ACKs with segment
+ * data, and don't drop ACKs with a higher cumulative ACK
+ * counter than the triggering packet. Check ACK seqno here to
+ * avoid parsing SACK options of packets we are going to exclude
+ * anyway.
+ */
+ if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
+ (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
+ after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
+ continue;
+
+ /* Check SACK options. The triggering packet must SACK more data
+ * than the ACK under consideration, or SACK the same range but
+ * have a larger cumulative ACK counter. The latter is a
+ * pathological case, but is contained in the following check
+ * anyway, just to be safe.
+ */
+ sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
+
+ if (sack_comp < 0 ||
+ (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
+ sack_comp == 0))
+ continue;
+
+ /* At this point we have found an eligible pure ACK to drop; if
+ * we are in aggressive mode, we are done. Otherwise, keep
+ * searching unless this is the second eligible ACK we
+ * found.
+ *
+ * Since we want to drop ACK closest to the head of the queue,
+ * save the first eligible ACK we find, even if we need to loop
+ * again.
+ */
+ if (!elig_ack) {
+ elig_ack = skb_check;
+ elig_ack_prev = skb_prev;
+ elig_flags = (tcp_flag_word(tcph_check)
+ & (TCP_FLAG_ECE | TCP_FLAG_CWR));
+ }
+
+ if (num_found++ > 0)
+ goto found;
+ }
+
+ /* We made it through the queue without finding two eligible ACKs. If
+ * we found a single eligible ACK we can drop it in aggressive mode if
+ * we can guarantee that this does not interfere with ECN flag
+ * information. We ensure this by dropping it only if the enqueued
+ * packet is consecutive with the eligible ACK, and their flags match.
+ */
+ if (elig_ack && aggressive && elig_ack->next == skb &&
+ (elig_flags == (tcp_flag_word(tcph) &
+ (TCP_FLAG_ECE | TCP_FLAG_CWR))))
+ goto found;
+
+ return NULL;
+
+found:
+ /* unlink elig_ack from the flow's singly linked list */
+ if (elig_ack_prev)
+ elig_ack_prev->next = elig_ack->next;
+ else
+ flow->head = elig_ack->next;
+
+ elig_ack->next = NULL;
+
+ return elig_ack;
+}
+
+/* Exponentially weighted moving average with weight 2^-shift:
+ * returns avg - (avg >> shift) + (sample >> shift).
+ */
+static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
+{
+	const u64 decay = avg >> shift;
+	const u64 gain = sample >> shift;
+
+	return avg - decay + gain;
+}
+
+/* Compute the length to charge for a packet of @len bytes.
+ *
+ * With CAKE_FLAG_OVERHEAD set, @off is subtracted first. The configured
+ * overhead is then added, the result is raised to at least rate_mpu, and
+ * ATM (48-byte payload per 53-byte cell) or PTM framing is applied
+ * according to atm_mode. The min/max statistics of both the net and the
+ * adjusted length in @q are updated along the way.
+ */
+static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
+{
+	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
+		len -= off;
+
+	if (len > q->max_netlen)
+		q->max_netlen = len;
+	if (len < q->min_netlen)
+		q->min_netlen = len;
+
+	len += q->rate_overhead;
+
+	if (len < q->rate_mpu)
+		len = q->rate_mpu;
+
+	switch (q->atm_mode) {
+	case CAKE_ATM_ATM:
+		/* whole cells only */
+		len = DIV_ROUND_UP(len, 48) * 53;
+		break;
+	case CAKE_ATM_PTM:
+		/* Add one byte per 64 bytes or part thereof.
+		 * This is conservative and easier to calculate than the
+		 * precise value.
+		 */
+		len += (len + 63) / 64;
+		break;
+	default:
+		break;
+	}
+
+	if (len > q->max_adjlen)
+		q->max_adjlen = len;
+	if (len < q->min_adjlen)
+		q->min_adjlen = len;
+
+	return len;
+}
+
+/* Return the overhead-adjusted length of @skb.
+ *
+ * Non-GSO packets are passed straight to cake_calc_overhead(). For GSO
+ * packets the per-segment wire length is reconstructed and the adjusted
+ * length is computed as (segs - 1) full-sized segments plus one final,
+ * possibly shorter, segment. Also feeds the network header offset into the
+ * q->avg_netoff moving average.
+ */
+static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int hdr_len, last_len = 0;
+ u32 off = skb_network_offset(skb);
+ u32 len = qdisc_pkt_len(skb);
+ u16 segs = 1;
+
+ /* the sample is scaled up by 2^16 before averaging */
+ q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
+
+ if (!shinfo->gso_size)
+ return cake_calc_overhead(q, len, off);
+
+ /* borrowed from qdisc_pkt_len_init() */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
+
+ /* for SKB_GSO_DODGY packets the segment count is recomputed from
+ * the payload size instead of using gso_segs
+ */
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
+ segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+ else
+ segs = shinfo->gso_segs;
+
+ /* len: one full segment including headers; last_len: what remains of
+ * skb->len after (segs - 1) segments of gso_size payload
+ */
+ len = shinfo->gso_size + hdr_len;
+ last_len = skb->len - shinfo->gso_size * (segs - 1);
+
+ return (cake_calc_overhead(q, len, off) * (segs - 1) +
+ cake_calc_overhead(q, last_len, off));
+}
+
+/* Exchange overflow heap slots @i and @j and update the back-references
+ * (overflow_idx) of the two queues involved so they point at their new
+ * slots.
+ */
+static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
+{
+	struct cake_heap_entry at_i = q->overflow_heap[i];
+	struct cake_heap_entry at_j = q->overflow_heap[j];
+
+	q->overflow_heap[j] = at_i;
+	q->overflow_heap[i] = at_j;
+
+	q->tins[at_i.t].overflow_idx[at_i.b] = j;
+	q->tins[at_j.t].overflow_idx[at_j.b] = i;
+}
+
+/* Backlog of the queue referenced by overflow heap slot @i. */
+static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
+{
+	const struct cake_heap_entry *entry = &q->overflow_heap[i];
+
+	return q->tins[entry->t].backlogs[entry->b];
+}
+
+/* Sift the entry at slot @i of the overflow max-heap (keyed by queue
+ * backlog) down until neither child has a larger backlog.
+ *
+ * Children are always compared against the backlog of the entry being
+ * sifted. Comparing against the backlog of the child that was just promoted
+ * instead would stop the sift after a single level and could leave the heap
+ * property violated further down.
+ */
+static void cake_heapify(struct cake_sched_data *q, u16 i)
+{
+	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
+	/* travels with the entry, so it stays valid across swaps */
+	const u32 ib = cake_heap_get_backlog(q, i);
+	u32 m = i;
+
+	while (m < a) {
+		u32 l = m + m + 1;
+		u32 r = l + 1;
+		u32 mb = ib;
+
+		if (l < a) {
+			u32 lb = cake_heap_get_backlog(q, l);
+
+			if (lb > mb) {
+				m = l;
+				mb = lb;
+			}
+		}
+
+		if (r < a) {
+			u32 rb = cake_heap_get_backlog(q, r);
+
+			if (rb > mb) {
+				m = r;
+				mb = rb;
+			}
+		}
+
+		/* neither child is larger: the entry is in place */
+		if (m == i)
+			break;
+
+		cake_heap_swap(q, i, m);
+		i = m;
+	}
+}
+
+/* Sift the entry at slot @i of the overflow max-heap up towards the root
+ * for as long as its backlog is larger than its parent's.
+ */
+static void cake_heapify_up(struct cake_sched_data *q, u16 i)
+{
+	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
+		u16 parent = (i - 1) >> 1;
+
+		if (cake_heap_get_backlog(q, i) <=
+		    cake_heap_get_backlog(q, parent))
+			break;
+
+		cake_heap_swap(q, i, parent);
+		i = parent;
+	}
+}
+
+/* Charge the bandwidth used by @skb to tin @b and to the global shaper by
+ * pushing their next-packet times forward. Nothing is charged when no rate
+ * is configured (q->rate_ns == 0). The failsafe timer is only advanced for
+ * packets that are not being dropped.
+ *
+ * Returns the packet's adjusted length.
+ */
+static int cake_advance_shaper(struct cake_sched_data *q,
+			       struct cake_tin_data *b,
+			       struct sk_buff *skb,
+			       ktime_t now, bool drop)
+{
+	u32 len = get_cobalt_cb(skb)->adjusted_len;
+	u64 tin_dur, global_dur, failsafe_dur;
+
+	if (!q->rate_ns)
+		return len;
+
+	tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
+	global_dur = (len * q->rate_ns) >> q->rate_shft;
+	/* the failsafe runs at 1.5 times the global duration */
+	failsafe_dur = global_dur + (global_dur >> 1);
+
+	if (ktime_before(b->time_next_packet, now))
+		b->time_next_packet = ktime_add_ns(b->time_next_packet,
+						   tin_dur);
+	else if (ktime_before(b->time_next_packet,
+			      ktime_add_ns(now, tin_dur)))
+		b->time_next_packet = ktime_add_ns(now, tin_dur);
+
+	q->time_next_packet = ktime_add_ns(q->time_next_packet, global_dur);
+
+	if (!drop)
+		q->failsafe_next_packet =
+			ktime_add_ns(q->failsafe_next_packet, failsafe_dur);
+
+	return len;
+}
+
+/* Drop one packet from the head of the queue at the top of the overflow
+ * heap, i.e. the queue with the largest backlog.
+ *
+ * The heap is rebuilt from scratch first if q->overflow_timeout is zero.
+ * Returns the affected queue encoded as idx + (tin << 16); this value is
+ * also returned when that queue turned out to be empty and nothing was
+ * dropped.
+ */
+static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ ktime_t now = ktime_get();
+ u32 idx = 0, tin = 0, len;
+ struct cake_heap_entry qq;
+ struct cake_tin_data *b;
+ struct cake_flow *flow;
+ struct sk_buff *skb;
+
+ if (!q->overflow_timeout) {
+ int i;
+ /* Build fresh max-heap */
+ for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
+ cake_heapify(q, i);
+ }
+ /* a non-zero value marks the heap as built */
+ q->overflow_timeout = 65535;
+
+ /* select longest queue for pruning */
+ qq = q->overflow_heap[0];
+ tin = qq.t;
+ idx = qq.b;
+
+ b = &q->tins[tin];
+ flow = &b->flows[idx];
+ skb = dequeue_head(flow);
+ if (unlikely(!skb)) {
+ /* heap has gone wrong, rebuild it next time */
+ q->overflow_timeout = 0;
+ return idx + (tin << 16);
+ }
+
+ /* tell the AQM about the overflow; true means BLUE just went active */
+ if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count++;
+
+ /* undo the accounting for the dropped packet */
+ len = qdisc_pkt_len(skb);
+ q->buffer_used -= skb->truesize;
+ b->backlogs[idx] -= len;
+ b->tin_backlog -= len;
+ sch->qstats.backlog -= len;
+ qdisc_tree_reduce_backlog(sch, 1, len);
+
+ flow->dropped++;
+ b->tin_dropped++;
+ sch->qstats.drops++;
+
+ /* in ingress mode the dropped packet is still charged to the shaper */
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ cake_advance_shaper(q, b, skb, now, true);
+
+ __qdisc_drop(skb, to_free);
+ sch->q.qlen--;
+
+ /* the top queue's backlog shrank: restore the heap order */
+ cake_heapify(q, 0);
+
+ return idx + (tin << 16);
+}
+
+/* Clear the DSCP bits of an IPv4 or IPv6 packet, keeping the ECN bits.
+ * Packets of any other protocol are left alone.
+ */
+static void cake_wash_diffserv(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+}
+
+/* Return the DSCP value of @skb (the DS field shifted right by 2).
+ *
+ * For IPv4/IPv6 a non-zero DSCP is additionally cleared in the packet when
+ * @wash is non-zero, keeping the ECN bits. ARP is reported as 0x38
+ * (CS7 - Net Control); any other protocol has no Diffserv field and is
+ * reported as 0, i.e. best-effort.
+ */
+static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
+{
+	u8 dscp;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+		if (wash && dscp)
+			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+		return dscp;
+	}
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+		if (wash && dscp)
+			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+		return dscp;
+	}
+
+	if (skb->protocol == htons(ETH_P_ARP))
+		return 0x38; /* CS7 - Net Control */
+
+	/* If there is no Diffserv field, treat as best-effort */
+	return 0;
+}
+
+/* Pick the tin for @skb.
+ *
+ * In order of precedence: an skb->priority whose major number matches this
+ * qdisc's handle and whose minor number is in 1..tin_cnt selects a tin via
+ * tin_order[]; otherwise, unless in best-effort mode, the packet's DSCP
+ * selects it via tin_index[]; otherwise tin 0 is used. With CAKE_FLAG_WASH
+ * set the DSCP bits are cleared in all three cases.
+ */
+static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
+					     struct sk_buff *skb)
+{
+	struct cake_sched_data *q = qdisc_priv(sch);
+	u32 tin = 0;
+
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    TC_H_MIN(skb->priority) > 0 &&
+	    TC_H_MIN(skb->priority) <= q->tin_cnt) {
+		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
+	} else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
+		/* extract the Diffserv Precedence field, if it exists, and
+		 * clear the DSCP bits if washing
+		 */
+		tin = q->tin_index[cake_handle_diffserv(skb,
+				q->rate_flags & CAKE_FLAG_WASH)];
+		if (unlikely(tin >= q->tin_cnt))
+			tin = 0;
+
+		return &q->tins[tin];
+	}
+
+	/* priority override and best-effort: washing is a separate step */
+	if (q->rate_flags & CAKE_FLAG_WASH)
+		cake_wash_diffserv(skb);
+
+	return &q->tins[tin];
+}
+
+/* Classify @skb into a tin and a flow queue.
+ *
+ * Returns 0 if an attached tc filter consumed the packet or asked for it to
+ * be dropped; *@qerr then holds the verdict to hand back and *@t is not
+ * set. Otherwise *@t is set to the selected tin and the return value is a
+ * 1-based flow index: the minor number of the filter's classid if that is
+ * in 1..CAKE_QUEUES, else the result of cake_hash() plus one.
+ */
+static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
+ struct sk_buff *skb, int flow_mode, int *qerr)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ u32 flow = 0;
+ int result;
+
+ /* without filters, *qerr is left untouched */
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ goto hash;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, filter, &res, false);
+
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ /* a larger minor number leaves flow at 0 and falls back to
+ * hashing below
+ */
+ if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
+ flow = TC_H_MIN(res.classid);
+ }
+hash:
+ *t = cake_select_tin(sch, skb);
+ return flow ?: cake_hash(*t, skb, flow_mode) + 1;
+}
+
+static void cake_reconfigure(struct Qdisc *sch);
+
+ /* Enqueue @skb on the flow queue selected by cake_classify().
+  *
+  * Steps, in order:
+  *  - classify; a flow index of 0 means the classifier rejected the packet,
+  *    which is then dropped and the classifier's code returned;
+  *  - refresh stale shaper timestamps when the tin / qdisc was empty;
+  *  - either split a GSO packet into segments (CAKE_FLAG_SPLIT_GSO) and queue
+  *    each segment, or queue the packet as is and optionally run the ACK
+  *    filter;
+  *  - update the ingress bandwidth estimate (CAKE_FLAG_AUTORATE_INGRESS);
+  *  - move the flow into the sparse rotation when it was idle or decaying;
+  *  - drop packets via cake_drop() while buffer_used exceeds buffer_limit.
+  *
+  * Returns NET_XMIT_SUCCESS unless the packet was rejected by the classifier
+  * or GSO segmentation failed (qdisc_drop()'s return value).
+  */
+ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ 			struct sk_buff **to_free)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	int len = qdisc_pkt_len(skb);
+ 	int uninitialized_var(ret);
+ 	struct sk_buff *ack = NULL;
+ 	ktime_t now = ktime_get();
+ 	struct cake_tin_data *b;
+ 	struct cake_flow *flow;
+ 	u32 idx;
+
+ 	/* choose flow to insert into */
+ 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
+ 	if (idx == 0) {
+ 		if (ret & __NET_XMIT_BYPASS)
+ 			qdisc_qstats_drop(sch);
+ 		__qdisc_drop(skb, to_free);
+ 		return ret;
+ 	}
+ 	idx--;
+ 	flow = &b->flows[idx];
+
+ 	/* ensure shaper state isn't stale */
+ 	if (!b->tin_backlog) {
+ 		if (ktime_before(b->time_next_packet, now))
+ 			b->time_next_packet = now;
+
+ 		if (!sch->q.qlen) {
+ 			if (ktime_before(q->time_next_packet, now)) {
+ 				q->failsafe_next_packet = now;
+ 				q->time_next_packet = now;
+ 			} else if (ktime_after(q->time_next_packet, now) &&
+ 				   ktime_after(q->failsafe_next_packet, now)) {
+ 				u64 next =
+ 					min(ktime_to_ns(q->time_next_packet),
+ 					    ktime_to_ns(
+ 						   q->failsafe_next_packet));
+ 				sch->qstats.overlimits++;
+ 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ 			}
+ 		}
+ 	}
+
+ 	if (unlikely(len > b->max_skblen))
+ 		b->max_skblen = len;
+
+ 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+ 		struct sk_buff *segs, *nskb;
+ 		netdev_features_t features = netif_skb_features(skb);
+ 		unsigned int slen = 0, numsegs = 0;
+
+ 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ 		if (IS_ERR_OR_NULL(segs))
+ 			return qdisc_drop(skb, sch, to_free);
+
+ 		while (segs) {
+ 			nskb = segs->next;
+ 			segs->next = NULL;
+ 			qdisc_skb_cb(segs)->pkt_len = segs->len;
+ 			cobalt_set_enqueue_time(segs, now);
+ 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
+ 									  segs);
+ 			flow_queue_add(flow, segs);
+
+ 			sch->q.qlen++;
+ 			numsegs++;
+ 			slen += segs->len;
+ 			q->buffer_used += segs->truesize;
+ 			b->packets++;
+ 			segs = nskb;
+ 		}
+
+ 		/* stats */
+ 		b->bytes += slen;
+ 		b->backlogs[idx] += slen;
+ 		b->tin_backlog += slen;
+ 		sch->qstats.backlog += slen;
+ 		q->avg_window_bytes += slen;
+
+ 		/* Ancestors will account one packet of @len bytes for the
+ 		 * NET_XMIT_SUCCESS returned below, but @numsegs packets
+ 		 * totalling @slen bytes were actually queued: hand them the
+ 		 * difference instead of cancelling the packet altogether.
+ 		 */
+ 		qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
+ 		consume_skb(skb);
+ 	} else {
+ 		/* not splitting */
+ 		cobalt_set_enqueue_time(skb, now);
+ 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
+ 		flow_queue_add(flow, skb);
+
+ 		if (q->ack_filter)
+ 			ack = cake_ack_filter(q, flow);
+
+ 		if (ack) {
+ 			b->ack_drops++;
+ 			sch->qstats.drops++;
+ 			b->bytes += qdisc_pkt_len(ack);
+ 			len -= qdisc_pkt_len(ack);
+ 			q->buffer_used += skb->truesize - ack->truesize;
+ 			if (q->rate_flags & CAKE_FLAG_INGRESS)
+ 				cake_advance_shaper(q, b, ack, now, true);
+
+ 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
+ 			consume_skb(ack);
+ 		} else {
+ 			sch->q.qlen++;
+ 			q->buffer_used += skb->truesize;
+ 		}
+
+ 		/* stats */
+ 		b->packets++;
+ 		b->bytes += len;
+ 		b->backlogs[idx] += len;
+ 		b->tin_backlog += len;
+ 		sch->qstats.backlog += len;
+ 		q->avg_window_bytes += len;
+ 	}
+
+ 	if (q->overflow_timeout)
+ 		cake_heapify_up(q, b->overflow_idx[idx]);
+
+ 	/* incoming bandwidth capacity estimate */
+ 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
+ 		u64 packet_interval =
+ 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
+
+ 		if (packet_interval > NSEC_PER_SEC)
+ 			packet_interval = NSEC_PER_SEC;
+
+ 		/* filter out short-term bursts, eg. wifi aggregation */
+ 		q->avg_packet_interval =
+ 			cake_ewma(q->avg_packet_interval,
+ 				  packet_interval,
+ 				  (packet_interval > q->avg_packet_interval ?
+ 					  2 : 8));
+
+ 		q->last_packet_time = now;
+
+ 		if (packet_interval > q->avg_packet_interval) {
+ 			u64 window_interval =
+ 				ktime_to_ns(ktime_sub(now,
+ 						      q->avg_window_begin));
+ 			u64 bw = q->avg_window_bytes * (u64)NSEC_PER_SEC;
+
+ 			/* window_interval is 64 bits wide; do_div() only
+ 			 * accepts a 32-bit divisor and would truncate it.
+ 			 */
+ 			bw = div64_u64(bw, window_interval);
+ 			q->avg_peak_bandwidth =
+ 				cake_ewma(q->avg_peak_bandwidth, bw,
+ 					  bw > q->avg_peak_bandwidth ? 2 : 8);
+ 			q->avg_window_bytes = 0;
+ 			q->avg_window_begin = now;
+
+ 			if (ktime_after(now,
+ 					ktime_add_ms(q->last_reconfig_time,
+ 						     250))) {
+ 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
+ 				cake_reconfigure(sch);
+ 			}
+ 		}
+ 	} else {
+ 		q->avg_window_bytes = 0;
+ 		q->last_packet_time = now;
+ 	}
+
+ 	/* flowchain */
+ 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
+ 		struct cake_host *srchost = &b->hosts[flow->srchost];
+ 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
+ 		u16 host_load = 1;
+
+ 		if (!flow->set) {
+ 			list_add_tail(&flow->flowchain, &b->new_flows);
+ 		} else {
+ 			b->decaying_flow_count--;
+ 			list_move_tail(&flow->flowchain, &b->new_flows);
+ 		}
+ 		flow->set = CAKE_SET_SPARSE;
+ 		b->sparse_flow_count++;
+
+ 		if (cake_dsrc(q->flow_mode))
+ 			host_load = max(host_load, srchost->srchost_refcnt);
+
+ 		if (cake_ddst(q->flow_mode))
+ 			host_load = max(host_load, dsthost->dsthost_refcnt);
+
+ 		flow->deficit = (b->flow_quantum *
+ 				 quantum_div[host_load]) >> 16;
+ 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
+ 		/* this flow was empty, accounted as a sparse flow, but actually
+ 		 * in the bulk rotation.
+ 		 */
+ 		flow->set = CAKE_SET_BULK;
+ 		b->sparse_flow_count--;
+ 		b->bulk_flow_count++;
+ 	}
+
+ 	if (q->buffer_used > q->buffer_max_used)
+ 		q->buffer_max_used = q->buffer_used;
+
+ 	if (q->buffer_used > q->buffer_limit) {
+ 		u32 dropped = 0;
+
+ 		while (q->buffer_used > q->buffer_limit) {
+ 			dropped++;
+ 			cake_drop(sch, to_free);
+ 		}
+ 		b->drop_overlimit += dropped;
+ 	}
+ 	return NET_XMIT_SUCCESS;
+ }
+
+ /* Pop the head packet of the flow addressed by q->cur_tin / q->cur_flow.
+  *
+  * Backlog, buffer and queue-length counters are reduced by the packet's
+  * size, and the overflow heap is re-ordered while overflow_timeout is set.
+  * Returns NULL when that flow queue is empty.
+  */
+ static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	struct cake_tin_data *tin = &q->tins[q->cur_tin];
+ 	struct cake_flow *flow = &tin->flows[q->cur_flow];
+ 	struct sk_buff *skb;
+ 	u32 bytes;
+
+ 	if (!flow->head)
+ 		return NULL;
+
+ 	skb = dequeue_head(flow);
+ 	bytes = qdisc_pkt_len(skb);
+
+ 	tin->backlogs[q->cur_flow] -= bytes;
+ 	tin->tin_backlog -= bytes;
+ 	sch->qstats.backlog -= bytes;
+ 	q->buffer_used -= skb->truesize;
+ 	sch->q.qlen--;
+
+ 	if (q->overflow_timeout)
+ 		cake_heapify(q, tin->overflow_idx[q->cur_flow]);
+
+ 	return skb;
+ }
+
+ /* Free every packet still queued in tin @tin.
+  *
+  * Note that this walks the flows through q->cur_tin / q->cur_flow, so both
+  * cursors are overwritten (cur_flow ends up equal to CAKE_QUEUES).
+  */
+ static void cake_clear_tin(struct Qdisc *sch, u16 tin)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	struct sk_buff *skb;
+
+ 	q->cur_tin = tin;
+ 	q->cur_flow = 0;
+ 	while (q->cur_flow < CAKE_QUEUES) {
+ 		for (;;) {
+ 			skb = cake_dequeue_one(sch);
+ 			if (!skb)
+ 				break;
+ 			kfree_skb(skb);
+ 		}
+ 		q->cur_flow++;
+ 	}
+ }
+
+ /* Dequeue the next packet to transmit, or return NULL.
+  *
+  * NULL is returned when the qdisc is empty, when both global shaper
+  * timestamps lie in the future (the watchdog is armed for the earlier one),
+  * or when no tin has a sparse or bulk flow to serve.
+  *
+  * Otherwise a tin is chosen (deficit rotation when rate_ns is zero,
+  * schedule-based when shaping), then a flow from the decaying list (first
+  * pass only), the new list or the old list.  A flow with an exhausted
+  * deficit gets its quantum replenished and is moved to the old list before
+  * another flow is tried.  Packets are pulled with cake_dequeue_one() and
+  * passed through cobalt_should_drop(); the last packet of a flow queue is
+  * never dropped here.  Finally delay statistics, deficits and the shaper
+  * are updated and the watchdog is re-armed where needed.
+  */
+ static struct sk_buff *cake_dequeue(struct Qdisc *sch)
+ {
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_tin_data *b = &q->tins[q->cur_tin];
+ struct cake_host *srchost, *dsthost;
+ ktime_t now = ktime_get();
+ struct cake_flow *flow;
+ struct list_head *head;
+ bool first_flow = true;
+ struct sk_buff *skb;
+ u16 host_load;
+ u64 delay;
+ u32 len;
+
+ begin:
+ if (!sch->q.qlen)
+ return NULL;
+
+ /* global hard shaper */
+ if (ktime_after(q->time_next_packet, now) &&
+ ktime_after(q->failsafe_next_packet, now)) {
+ u64 next = min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(q->failsafe_next_packet));
+
+ sch->qstats.overlimits++;
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ return NULL;
+ }
+
+ /* Choose a class to work on. */
+ if (!q->rate_ns) {
+ /* In unlimited mode, can't rely on shaper timings, just balance
+ * with DRR
+ */
+ bool wrapped = false, empty = true;
+
+ while (b->tin_deficit < 0 ||
+ !(b->sparse_flow_count + b->bulk_flow_count)) {
+ if (b->tin_deficit <= 0)
+ b->tin_deficit += b->tin_quantum_band;
+ if (b->sparse_flow_count + b->bulk_flow_count)
+ empty = false;
+
+ q->cur_tin++;
+ b++;
+ if (q->cur_tin >= q->tin_cnt) {
+ q->cur_tin = 0;
+ b = q->tins;
+
+ if (wrapped) {
+ /* It's possible for q->qlen to be
+ * nonzero when we actually have no
+ * packets anywhere.
+ */
+ if (empty)
+ return NULL;
+ } else {
+ wrapped = true;
+ }
+ }
+ }
+ } else {
+ /* In shaped mode, choose:
+ * - Highest-priority tin with queue and meeting schedule, or
+ * - The earliest-scheduled tin with queue.
+ */
+ ktime_t best_time = KTIME_MAX;
+ int tin, best_tin = 0;
+
+ for (tin = 0; tin < q->tin_cnt; tin++) {
+ b = q->tins + tin;
+ if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
+ ktime_t time_to_pkt = \
+ ktime_sub(b->time_next_packet, now);
+
+ if (ktime_to_ns(time_to_pkt) <= 0 ||
+ ktime_compare(time_to_pkt,
+ best_time) <= 0) {
+ best_time = time_to_pkt;
+ best_tin = tin;
+ }
+ }
+ }
+
+ q->cur_tin = best_tin;
+ b = q->tins + best_tin;
+
+ /* No point in going further if no packets to deliver. */
+ if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
+ return NULL;
+ }
+
+ retry:
+ /* service this class */
+ head = &b->decaying_flows;
+ if (!first_flow || list_empty(head)) {
+ head = &b->new_flows;
+ if (list_empty(head)) {
+ head = &b->old_flows;
+ if (unlikely(list_empty(head))) {
+ head = &b->decaying_flows;
+ if (unlikely(list_empty(head)))
+ goto begin;
+ }
+ }
+ }
+ flow = list_first_entry(head, struct cake_flow, flowchain);
+ q->cur_flow = flow - b->flows;
+ first_flow = false;
+
+ /* triple isolation (modified DRR++) */
+ srchost = &b->hosts[flow->srchost];
+ dsthost = &b->hosts[flow->dsthost];
+ host_load = 1;
+
+ if (cake_dsrc(q->flow_mode))
+ host_load = max(host_load, srchost->srchost_refcnt);
+
+ if (cake_ddst(q->flow_mode))
+ host_load = max(host_load, dsthost->dsthost_refcnt);
+
+ WARN_ON(host_load > CAKE_QUEUES);
+
+ /* flow isolation (DRR++) */
+ if (flow->deficit <= 0) {
+ /* The shifted prandom_u32() is a way to apply dithering to
+ * avoid accumulating roundoff errors
+ */
+ flow->deficit += (b->flow_quantum * quantum_div[host_load] +
+ (prandom_u32() >> 16)) >> 16;
+ list_move_tail(&flow->flowchain, &b->old_flows);
+
+ /* Keep all flows with deficits out of the sparse and decaying
+ * rotations. No non-empty flow can go into the decaying
+ * rotation, so they can't get deficits
+ */
+ if (flow->set == CAKE_SET_SPARSE) {
+ if (flow->head) {
+ b->sparse_flow_count--;
+ b->bulk_flow_count++;
+ flow->set = CAKE_SET_BULK;
+ } else {
+ /* we've moved it to the bulk rotation for
+ * correct deficit accounting but we still want
+ * to count it as a sparse flow, not a bulk one.
+ */
+ flow->set = CAKE_SET_SPARSE_WAIT;
+ }
+ }
+ goto retry;
+ }
+
+ /* Retrieve a packet via the AQM */
+ while (1) {
+ skb = cake_dequeue_one(sch);
+ if (!skb) {
+ /* this queue was actually empty */
+ if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count--;
+
+ if (flow->cvars.p_drop || flow->cvars.count ||
+ ktime_before(now, flow->cvars.drop_next)) {
+ /* keep in the flowchain until the state has
+ * decayed to rest
+ */
+ list_move_tail(&flow->flowchain,
+ &b->decaying_flows);
+ if (flow->set == CAKE_SET_BULK) {
+ b->bulk_flow_count--;
+ b->decaying_flow_count++;
+ } else if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT) {
+ b->sparse_flow_count--;
+ b->decaying_flow_count++;
+ }
+ flow->set = CAKE_SET_DECAYING;
+ } else {
+ /* remove empty queue from the flowchain */
+ list_del_init(&flow->flowchain);
+ if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT)
+ b->sparse_flow_count--;
+ else if (flow->set == CAKE_SET_BULK)
+ b->bulk_flow_count--;
+ else
+ b->decaying_flow_count--;
+
+ flow->set = CAKE_SET_NONE;
+ srchost->srchost_refcnt--;
+ dsthost->dsthost_refcnt--;
+ }
+ goto begin;
+ }
+
+ /* Last packet in queue may be marked, shouldn't be dropped */
+ if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
+ (b->bulk_flow_count *
+ !!(q->rate_flags &
+ CAKE_FLAG_INGRESS))) ||
+ !flow->head)
+ break;
+
+ /* drop this packet, get another one */
+ if (q->rate_flags & CAKE_FLAG_INGRESS) {
+ len = cake_advance_shaper(q, b, skb,
+ now, true);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+ }
+ flow->dropped++;
+ b->tin_dropped++;
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ goto retry;
+ }
+
+ b->tin_ecn_mark += !!flow->cvars.ecn_marked;
+ qdisc_bstats_update(sch, skb);
+
+ /* collect delay stats */
+ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+ b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
+ b->peak_delay = cake_ewma(b->peak_delay, delay,
+ delay > b->peak_delay ? 2 : 8);
+ b->base_delay = cake_ewma(b->base_delay, delay,
+ delay < b->base_delay ? 2 : 8);
+
+ len = cake_advance_shaper(q, b, skb, now, false);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+
+ if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
+ u64 next = min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(q->failsafe_next_packet));
+
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ } else if (!sch->q.qlen) {
+ int i;
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ if (q->tins[i].decaying_flow_count) {
+ ktime_t next = \
+ ktime_add_ns(now,
+ q->tins[i].cparams.target);
+
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ ktime_to_ns(next));
+ break;
+ }
+ }
+ }
+
+ if (q->overflow_timeout)
+ q->overflow_timeout--;
+
+ return skb;
+ }
+
+ /* Qdisc .reset handler: discard every queued packet in every tin.
+  *
+  * cake_init() can fail before q->tins has been allocated, and
+  * cake_clear_tin() dereferences q->tins unconditionally, so bail out when
+  * there is nothing to clear.
+  */
+ static void cake_reset(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	u32 c;
+
+ 	if (!q->tins)
+ 		return;
+
+ 	for (c = 0; c < CAKE_MAX_TINS; c++)
+ 		cake_clear_tin(sch, c);
+ }
+
+ /* Netlink attribute policy handed to nla_parse_nested() by cake_change().
+  * TCA_CAKE_SPLIT_GSO has no entry: it is only emitted by cake_dump() and is
+  * not read by cake_change().
+  */
+ static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
+ [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 },
+ [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_ATM] = { .type = NLA_U32 },
+ [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
+ [TCA_CAKE_RTT] = { .type = NLA_U32 },
+ [TCA_CAKE_TARGET] = { .type = NLA_U32 },
+ [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
+ [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
+ [TCA_CAKE_NAT] = { .type = NLA_U32 },
+ [TCA_CAKE_RAW] = { .type = NLA_U32 },
+ [TCA_CAKE_WASH] = { .type = NLA_U32 },
+ [TCA_CAKE_MPU] = { .type = NLA_U32 },
+ [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
+ [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+ };
+
+ /* Program the shaper and Cobalt parameters of tin @b.
+  *
+  * @rate:       tin rate; 0 means unlimited (rate_ns and rate_shft stay 0)
+  * @mtu:        packet size used to derive the per-packet serialisation time
+  * @target_ns:  lower bound for the Cobalt target
+  * @rtt_est_ns: estimate used for the Cobalt interval
+  *
+  * The rate is converted to a fixed-point time-per-byte value: rate_ns is
+  * scaled by 2^rate_shft and then normalised until it fits in 34 bits.
+  * Rates below MIN_RATE are clamped for this conversion so the shaper will
+  * always unwedge in reasonable time.
+  */
+ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
+ 			  u64 target_ns, u64 rtt_est_ns)
+ {
+ 	static const u64 MIN_RATE = 64;
+ 	u64 byte_target_ns;
+ 	u64 rate_ns = 0;
+ 	u8 rate_shft = 0;
+
+ 	if (!rate) {
+ 		/* unlimited, ie. zero delay */
+ 		b->flow_quantum = 1514;
+ 	} else {
+ 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
+ 		rate_shft = 34;
+ 		rate_ns = div64_u64(((u64)NSEC_PER_SEC) << rate_shft,
+ 				    max(MIN_RATE, rate));
+ 		for (; rate_ns >> 34; rate_shft--)
+ 			rate_ns >>= 1;
+ 	}
+
+ 	b->tin_rate_bps = rate;
+ 	b->tin_rate_ns = rate_ns;
+ 	b->tin_rate_shft = rate_shft;
+
+ 	/* time needed to serialise @mtu bytes at this rate */
+ 	byte_target_ns = (mtu * rate_ns) >> rate_shft;
+
+ 	b->cparams.mtu_time = byte_target_ns;
+ 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
+ 	b->cparams.interval = max(rtt_est_ns +
+ 				  b->cparams.target - target_ns,
+ 				  b->cparams.target * 2);
+ 	b->cparams.p_inc = 1 << 24; /* 1/256 */
+ 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
+ }
+
+ /* Configure a single tin that carries all traffic at the full rate.
+  * Always returns 0.
+  */
+ static int cake_config_besteffort(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	struct cake_tin_data *tin = &q->tins[0];
+
+ 	q->tin_cnt = 1;
+ 	q->tin_index = besteffort;
+ 	q->tin_order = normal_order;
+
+ 	cake_set_rate(tin, q->rate_bps, psched_mtu(qdisc_dev(sch)),
+ 		      us_to_ns(q->target), us_to_ns(q->interval));
+
+ 	tin->tin_quantum_prio = 65535;
+ 	tin->tin_quantum_band = 65535;
+
+ 	return 0;
+ }
+
+ /* Configure eight tins using the "precedence" codepoint table.
+  *
+  * Converts the high-level (user visible) parameters into internal format.
+  * Each successive tin gets 7/8 of the previous tin's rate and bandwidth
+  * quantum and 3/2 of its priority quantum.  Always returns 0.
+  */
+ static int cake_config_precedence(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	u32 mtu = psched_mtu(qdisc_dev(sch));
+ 	u64 rate = q->rate_bps;
+ 	u32 prio_quantum = 256;
+ 	u32 band_quantum = 256;
+ 	struct cake_tin_data *tin;
+
+ 	q->tin_cnt = 8;
+ 	q->tin_index = precedence;
+ 	q->tin_order = normal_order;
+
+ 	for (tin = q->tins; tin < q->tins + q->tin_cnt; tin++) {
+ 		cake_set_rate(tin, rate, mtu, us_to_ns(q->target),
+ 			      us_to_ns(q->interval));
+
+ 		tin->tin_quantum_prio = max_t(u16, 1U, prio_quantum);
+ 		tin->tin_quantum_band = max_t(u16, 1U, band_quantum);
+
+ 		/* derive the next tin's parameters */
+ 		rate = (rate * 7) >> 3;
+ 		prio_quantum = (prio_quantum * 3) >> 1;
+ 		band_quantum = (band_quantum * 7) >> 3;
+ 	}
+
+ 	return 0;
+ }
+
+/* List of known Diffserv codepoints:
+ *
+ * Least Effort (CS1)
+ * Best Effort (CS0)
+ * Max Reliability & LLT "Lo" (TOS1)
+ * Max Throughput (TOS2)
+ * Min Delay (TOS4)
+ * LLT "La" (TOS5)
+ * Assured Forwarding 1 (AF1x) - x3
+ * Assured Forwarding 2 (AF2x) - x3
+ * Assured Forwarding 3 (AF3x) - x3
+ * Assured Forwarding 4 (AF4x) - x3
+ * Precedence Class 2 (CS2)
+ * Precedence Class 3 (CS3)
+ * Precedence Class 4 (CS4)
+ * Precedence Class 5 (CS5)
+ * Precedence Class 6 (CS6)
+ * Precedence Class 7 (CS7)
+ * Voice Admit (VA)
+ * Expedited Forwarding (EF)
+
+ * Total 26 codepoints.
+ */
+
+/* List of traffic classes in RFC 4594:
+ * (roughly descending order of contended priority)
+ * (roughly ascending order of uncontended throughput)
+ *
+ * Network Control (CS6,CS7) - routing traffic
+ * Telephony (EF,VA) - aka. VoIP streams
+ * Signalling (CS5) - VoIP setup
+ * Multimedia Conferencing (AF4x) - aka. video calls
+ * Realtime Interactive (CS4) - eg. games
+ * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
+ * Broadcast Video (CS3)
+ * Low Latency Data (AF2x,TOS4) - eg. database
+ * Ops, Admin, Management (CS2,TOS1) - eg. ssh
+ * Standard Service (CS0 & unrecognised codepoints)
+ * High Throughput Data (AF1x,TOS2) - eg. web traffic
+ * Low Priority Data (CS1) - eg. BitTorrent
+
+ * Total 12 traffic classes.
+ */
+
+ /* Configure eight tins using the diffserv8 codepoint table.
+  *
+  * Pruned list of traffic classes for typical applications:
+  *
+  * Network Control (CS6, CS7)
+  * Minimum Latency (EF, VA, CS5, CS4)
+  * Interactive Shell (CS2, TOS1)
+  * Low Latency Transactions (AF2x, TOS4)
+  * Video Streaming (AF4x, AF3x, CS3)
+  * Bog Standard (CS0 etc.)
+  * High Throughput (AF1x, TOS2)
+  * Background Traffic (CS1)
+  *
+  * Total 8 traffic classes.
+  *
+  * Each successive tin gets 7/8 of the previous tin's rate and bandwidth
+  * quantum and 3/2 of its priority quantum.  Always returns 0.
+  */
+ static int cake_config_diffserv8(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	u32 mtu = psched_mtu(qdisc_dev(sch));
+ 	u64 rate = q->rate_bps;
+ 	u32 prio_quantum = 256;
+ 	u32 band_quantum = 256;
+ 	struct cake_tin_data *tin;
+
+ 	q->tin_cnt = 8;
+
+ 	/* codepoint to class mapping */
+ 	q->tin_index = diffserv8;
+ 	q->tin_order = normal_order;
+
+ 	/* class characteristics */
+ 	for (tin = q->tins; tin < q->tins + q->tin_cnt; tin++) {
+ 		cake_set_rate(tin, rate, mtu, us_to_ns(q->target),
+ 			      us_to_ns(q->interval));
+
+ 		tin->tin_quantum_prio = max_t(u16, 1U, prio_quantum);
+ 		tin->tin_quantum_band = max_t(u16, 1U, band_quantum);
+
+ 		/* derive the next class's parameters */
+ 		rate = (rate * 7) >> 3;
+ 		prio_quantum = (prio_quantum * 3) >> 1;
+ 		band_quantum = (band_quantum * 7) >> 3;
+ 	}
+
+ 	return 0;
+ }
+
+ /* Configure four tins using the diffserv4 codepoint table.
+  *
+  * Further pruned list of traffic classes for four-class system:
+  *
+  * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
+  * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
+  * Best Effort (CS0, AF1x, TOS2, and those not specified)
+  * Background Traffic (CS1)
+  *
+  * Total 4 traffic classes.  Always returns 0.
+  */
+ static int cake_config_diffserv4(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	u32 mtu = psched_mtu(qdisc_dev(sch));
+ 	u64 target_ns = us_to_ns(q->target);
+ 	u64 interval_ns = us_to_ns(q->interval);
+ 	u64 rate = q->rate_bps;
+ 	u32 quantum = 1024;
+
+ 	q->tin_cnt = 4;
+
+ 	/* codepoint to class mapping */
+ 	q->tin_index = diffserv4;
+ 	q->tin_order = bulk_order;
+
+ 	/* tin 0: full rate */
+ 	cake_set_rate(&q->tins[0], rate, mtu, target_ns, interval_ns);
+ 	q->tins[0].tin_quantum_prio = quantum;
+ 	q->tins[0].tin_quantum_band = quantum;
+
+ 	/* tin 1: 1/16 of the rate */
+ 	cake_set_rate(&q->tins[1], rate >> 4, mtu, target_ns, interval_ns);
+ 	q->tins[1].tin_quantum_prio = quantum >> 4;
+ 	q->tins[1].tin_quantum_band = quantum >> 4;
+
+ 	/* tin 2: 1/2 of the rate */
+ 	cake_set_rate(&q->tins[2], rate >> 1, mtu, target_ns, interval_ns);
+ 	q->tins[2].tin_quantum_prio = quantum << 2;
+ 	q->tins[2].tin_quantum_band = quantum >> 1;
+
+ 	/* tin 3: 1/4 of the rate */
+ 	cake_set_rate(&q->tins[3], rate >> 2, mtu, target_ns, interval_ns);
+ 	q->tins[3].tin_quantum_prio = quantum << 4;
+ 	q->tins[3].tin_quantum_band = quantum >> 2;
+
+ 	return 0;
+ }
+
+ /* Configure the simplified three-tin Diffserv structure:
+  * Low Priority (CS1)
+  * Best Effort
+  * Latency Sensitive (TOS4, VA, EF, CS6, CS7)
+  *
+  * Always returns 0.
+  */
+ static int cake_config_diffserv3(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	u32 mtu = psched_mtu(qdisc_dev(sch));
+ 	u64 target_ns = us_to_ns(q->target);
+ 	u64 interval_ns = us_to_ns(q->interval);
+ 	u64 rate = q->rate_bps;
+ 	u32 quantum = 1024;
+
+ 	q->tin_cnt = 3;
+
+ 	/* codepoint to class mapping */
+ 	q->tin_index = diffserv3;
+ 	q->tin_order = bulk_order;
+
+ 	/* tin 0: full rate */
+ 	cake_set_rate(&q->tins[0], rate, mtu, target_ns, interval_ns);
+ 	q->tins[0].tin_quantum_prio = quantum;
+ 	q->tins[0].tin_quantum_band = quantum;
+
+ 	/* tin 1: 1/16 of the rate */
+ 	cake_set_rate(&q->tins[1], rate >> 4, mtu, target_ns, interval_ns);
+ 	q->tins[1].tin_quantum_prio = quantum >> 4;
+ 	q->tins[1].tin_quantum_band = quantum >> 4;
+
+ 	/* tin 2: 1/4 of the rate */
+ 	cake_set_rate(&q->tins[2], rate >> 2, mtu, target_ns, interval_ns);
+ 	q->tins[2].tin_quantum_prio = quantum << 4;
+ 	q->tins[2].tin_quantum_band = quantum >> 2;
+
+ 	return 0;
+ }
+
+ /* Re-derive all internal parameters from the user-visible configuration.
+  *
+  * Runs the tin setup matching q->tin_mode (diffserv3 for unknown modes),
+  * empties the tins that are no longer in use, copies the reference tin's
+  * shaper parameters to the qdisc level and recomputes the buffer limit.
+  */
+ static void cake_reconfigure(struct Qdisc *sch)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	u32 limit_cap;
+ 	int c, ft;
+
+ 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
+ 		ft = cake_config_besteffort(sch);
+ 	else if (q->tin_mode == CAKE_DIFFSERV_PRECEDENCE)
+ 		ft = cake_config_precedence(sch);
+ 	else if (q->tin_mode == CAKE_DIFFSERV_DIFFSERV8)
+ 		ft = cake_config_diffserv8(sch);
+ 	else if (q->tin_mode == CAKE_DIFFSERV_DIFFSERV4)
+ 		ft = cake_config_diffserv4(sch);
+ 	else
+ 		ft = cake_config_diffserv3(sch);
+
+ 	sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ 	/* flush the tins beyond the configured count */
+ 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
+ 		cake_clear_tin(sch, c);
+ 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
+ 	}
+
+ 	q->rate_ns = q->tins[ft].tin_rate_ns;
+ 	q->rate_shft = q->tins[ft].tin_rate_shft;
+
+ 	if (q->buffer_config_limit) {
+ 		q->buffer_limit = q->buffer_config_limit;
+ 	} else if (!q->rate_bps) {
+ 		q->buffer_limit = ~0;
+ 	} else {
+ 		u64 t = q->rate_bps * q->interval;
+
+ 		do_div(t, USEC_PER_SEC / 4);
+ 		q->buffer_limit = max_t(u32, t, 4U << 20);
+ 	}
+
+ 	limit_cap = max(sch->limit * psched_mtu(qdisc_dev(sch)),
+ 			q->buffer_config_limit);
+ 	q->buffer_limit = min(q->buffer_limit, limit_cap);
+ }
+
+ /* Apply the netlink options in @opt to the qdisc configuration.
+  *
+  * Only attributes present in @opt are changed.  Returns -EINVAL when @opt
+  * is NULL, the nla_parse_nested() error on a malformed message, and
+  * -EOPNOTSUPP when NAT mode is requested on a kernel built without
+  * conntrack.  When the tins are already allocated the new settings are
+  * applied with cake_reconfigure() under the qdisc tree lock.
+  */
+ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
+ 		       struct netlink_ext_ack *extack)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	struct nlattr *tb[TCA_CAKE_MAX + 1];
+ 	int err;
+
+ 	if (!opt)
+ 		return -EINVAL;
+
+ 	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
+ 	if (err < 0)
+ 		return err;
+
+ 	if (tb[TCA_CAKE_NAT]) {
+ #if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ 		if (nla_get_u32(tb[TCA_CAKE_NAT]))
+ 			q->flow_mode |= CAKE_FLOW_NAT_FLAG;
+ 		else
+ 			q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
+ #else
+ 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
+ 				    "No conntrack support in kernel");
+ 		return -EOPNOTSUPP;
+ #endif
+ 	}
+
+ 	if (tb[TCA_CAKE_BASE_RATE64])
+ 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
+
+ 	if (tb[TCA_CAKE_DIFFSERV_MODE])
+ 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
+
+ 	if (tb[TCA_CAKE_WASH]) {
+ 		if (nla_get_u32(tb[TCA_CAKE_WASH]))
+ 			q->rate_flags |= CAKE_FLAG_WASH;
+ 		else
+ 			q->rate_flags &= ~CAKE_FLAG_WASH;
+ 	}
+
+ 	if (tb[TCA_CAKE_FLOW_MODE]) {
+ 		/* keep the NAT bit, replace the rest of the mode */
+ 		u32 nat_bit = q->flow_mode & CAKE_FLOW_NAT_FLAG;
+ 		u32 mode = nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
+ 			   CAKE_FLOW_MASK;
+
+ 		q->flow_mode = nat_bit | mode;
+ 	}
+
+ 	if (tb[TCA_CAKE_ATM])
+ 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
+
+ 	if (tb[TCA_CAKE_OVERHEAD]) {
+ 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
+ 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
+ 	}
+
+ 	if (tb[TCA_CAKE_RAW])
+ 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
+
+ 	/* either overhead option restarts the packet length statistics */
+ 	if (tb[TCA_CAKE_OVERHEAD] || tb[TCA_CAKE_RAW]) {
+ 		q->max_netlen = 0;
+ 		q->max_adjlen = 0;
+ 		q->min_netlen = ~0;
+ 		q->min_adjlen = ~0;
+ 	}
+
+ 	if (tb[TCA_CAKE_MPU])
+ 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
+
+ 	if (tb[TCA_CAKE_RTT]) {
+ 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
+ 		if (q->interval == 0)
+ 			q->interval = 1;
+ 	}
+
+ 	if (tb[TCA_CAKE_TARGET]) {
+ 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
+ 		if (q->target == 0)
+ 			q->target = 1;
+ 	}
+
+ 	if (tb[TCA_CAKE_AUTORATE]) {
+ 		if (nla_get_u32(tb[TCA_CAKE_AUTORATE]))
+ 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
+ 		else
+ 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
+ 	}
+
+ 	if (tb[TCA_CAKE_INGRESS]) {
+ 		if (nla_get_u32(tb[TCA_CAKE_INGRESS]))
+ 			q->rate_flags |= CAKE_FLAG_INGRESS;
+ 		else
+ 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
+ 	}
+
+ 	if (tb[TCA_CAKE_ACK_FILTER])
+ 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
+
+ 	if (tb[TCA_CAKE_MEMORY])
+ 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
+
+ 	/* GSO splitting is only enabled for non-zero rates up to the
+ 	 * threshold
+ 	 */
+ 	if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD)
+ 		q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ 	else
+ 		q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+
+ 	if (q->tins) {
+ 		sch_tree_lock(sch);
+ 		cake_reconfigure(sch);
+ 		sch_tree_unlock(sch);
+ 	}
+
+ 	return 0;
+ }
+
+ /* Qdisc .destroy handler: cancel the watchdog, release the filter block
+  * and free the tin array.
+  */
+ static void cake_destroy(struct Qdisc *sch)
+ {
+ struct cake_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ tcf_block_put(q->block);
+ kvfree(q->tins);
+ }
+
+ /* Qdisc .init handler.
+  *
+  * Sets the defaults (diffserv3, triple-isolate flows, unlimited rate,
+  * 100ms interval, 5ms target), applies the optional netlink options in
+  * @opt, acquires the filter block, fills the quantum_div[] table, allocates
+  * and initialises all tins and finally runs cake_reconfigure().
+  *
+  * Returns 0 on success or a negative errno.  On failure nothing is torn
+  * down here: the qdisc core invokes ->destroy() after a failed ->init(),
+  * so calling cake_destroy() from this function as well would release the
+  * filter block twice.
+  */
+ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
+ 		     struct netlink_ext_ack *extack)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	int i, j, err;
+
+ 	sch->limit = 10240;
+ 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
+ 	q->flow_mode = CAKE_FLOW_TRIPLE;
+
+ 	q->rate_bps = 0; /* unlimited by default */
+
+ 	q->interval = 100000; /* 100ms default */
+ 	q->target = 5000; /* 5ms: codel RFC argues
+ 			   * for 5 to 10% of interval
+ 			   */
+
+ 	q->cur_tin = 0;
+ 	q->cur_flow = 0;
+
+ 	qdisc_watchdog_init(&q->watchdog, sch);
+
+ 	if (opt) {
+ 		err = cake_change(sch, opt, extack);
+ 		if (err)
+ 			return err;
+ 	}
+
+ 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ 	if (err)
+ 		return err;
+
+ 	quantum_div[0] = ~0;
+ 	for (i = 1; i <= CAKE_QUEUES; i++)
+ 		quantum_div[i] = 65535 / i;
+
+ 	q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
+ 			   GFP_KERNEL);
+ 	if (!q->tins)
+ 		return -ENOMEM;
+
+ 	for (i = 0; i < CAKE_MAX_TINS; i++) {
+ 		struct cake_tin_data *b = q->tins + i;
+
+ 		INIT_LIST_HEAD(&b->new_flows);
+ 		INIT_LIST_HEAD(&b->old_flows);
+ 		INIT_LIST_HEAD(&b->decaying_flows);
+ 		b->sparse_flow_count = 0;
+ 		b->bulk_flow_count = 0;
+ 		b->decaying_flow_count = 0;
+
+ 		for (j = 0; j < CAKE_QUEUES; j++) {
+ 			struct cake_flow *flow = b->flows + j;
+ 			u32 k = j * CAKE_MAX_TINS + i;
+
+ 			INIT_LIST_HEAD(&flow->flowchain);
+ 			cobalt_vars_init(&flow->cvars);
+
+ 			/* heap slot k initially refers to (tin i, flow j) */
+ 			q->overflow_heap[k].t = i;
+ 			q->overflow_heap[k].b = j;
+ 			b->overflow_idx[j] = k;
+ 		}
+ 	}
+
+ 	cake_reconfigure(sch);
+ 	q->avg_peak_bandwidth = q->rate_bps;
+ 	q->min_netlen = ~0;
+ 	q->min_adjlen = ~0;
+ 	return 0;
+ }
+
+ /* Qdisc .dump handler: emit the current configuration as a TCA_OPTIONS
+  * nest.  TCA_CAKE_RAW is only emitted while the overhead flag is clear.
+  * Returns the result of nla_nest_end() on success and -1 as soon as any
+  * attribute does not fit into @skb.
+  */
+ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	struct nlattr *opts;
+
+ 	opts = nla_nest_start(skb, TCA_OPTIONS);
+ 	if (!opts)
+ 		return -1;
+
+ 	/* attributes are emitted left to right; the first failure aborts */
+ 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
+ 			      TCA_CAKE_PAD) ||
+ 	    nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
+ 			q->flow_mode & CAKE_FLOW_MASK) ||
+ 	    nla_put_u32(skb, TCA_CAKE_RTT, q->interval) ||
+ 	    nla_put_u32(skb, TCA_CAKE_TARGET, q->target) ||
+ 	    nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit) ||
+ 	    nla_put_u32(skb, TCA_CAKE_AUTORATE,
+ 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)) ||
+ 	    nla_put_u32(skb, TCA_CAKE_INGRESS,
+ 			!!(q->rate_flags & CAKE_FLAG_INGRESS)) ||
+ 	    nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter) ||
+ 	    nla_put_u32(skb, TCA_CAKE_NAT,
+ 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)) ||
+ 	    nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode) ||
+ 	    nla_put_u32(skb, TCA_CAKE_WASH,
+ 			!!(q->rate_flags & CAKE_FLAG_WASH)) ||
+ 	    nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead) ||
+ 	    (!(q->rate_flags & CAKE_FLAG_OVERHEAD) &&
+ 	     nla_put_u32(skb, TCA_CAKE_RAW, 0)) ||
+ 	    nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode) ||
+ 	    nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu) ||
+ 	    nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
+ 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
+ 		return -1;
+
+ 	return nla_nest_end(skb, opts);
+ }
+
+ /* Qdisc .dump_stats handler: emit global statistics followed by one nested
+  * attribute per configured tin (numbered from 1, in tin_order[] order)
+  * inside a TCA_STATS_APP nest.  Returns the result of the outer
+  * nla_nest_end() on success; on overflow the nest is cancelled and -1
+  * returned.
+  */
+ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	struct nlattr *stats, *tstats, *ts;
+ 	struct sk_buff *skb = d->skb;
+ 	int i;
+
+ 	stats = nla_nest_start(skb, TCA_STATS_APP);
+ 	if (!stats)
+ 		return -1;
+
+ #define PUT_STAT_U32(attr, data) do {				       \
+ 		if (nla_put_u32(skb, TCA_CAKE_STATS_ ## attr, data))   \
+ 			goto nla_put_failure;			       \
+ 	} while (0)
+ #define PUT_STAT_U64(attr, data) do {				       \
+ 		if (nla_put_u64_64bit(skb, TCA_CAKE_STATS_ ## attr,    \
+ 					data, TCA_CAKE_STATS_PAD))     \
+ 			goto nla_put_failure;			       \
+ 	} while (0)
+
+ 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
+ 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
+ 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
+ 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
+ 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
+ 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
+ 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
+ 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
+
+ #undef PUT_STAT_U32
+ #undef PUT_STAT_U64
+
+ 	tstats = nla_nest_start(skb, TCA_CAKE_STATS_TIN_STATS);
+ 	if (!tstats)
+ 		goto nla_put_failure;
+
+ #define PUT_TSTAT_U32(attr, data) do {					\
+ 		if (nla_put_u32(skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
+ 			goto nla_put_failure;				\
+ 	} while (0)
+ #define PUT_TSTAT_U64(attr, data) do {					\
+ 		if (nla_put_u64_64bit(skb, TCA_CAKE_TIN_STATS_ ## attr, \
+ 					data, TCA_CAKE_TIN_STATS_PAD))	\
+ 			goto nla_put_failure;				\
+ 	} while (0)
+
+ 	for (i = 0; i < q->tin_cnt; i++) {
+ 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
+
+ 		/* nested attribute types are 1-based */
+ 		ts = nla_nest_start(skb, i + 1);
+ 		if (!ts)
+ 			goto nla_put_failure;
+
+ 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
+ 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
+ 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
+
+ 		PUT_TSTAT_U32(TARGET_US,
+ 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
+ 		PUT_TSTAT_U32(INTERVAL_US,
+ 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
+
+ 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
+ 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
+ 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
+ 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
+
+ 		PUT_TSTAT_U32(PEAK_DELAY_US,
+ 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
+ 		PUT_TSTAT_U32(AVG_DELAY_US,
+ 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
+ 		PUT_TSTAT_U32(BASE_DELAY_US,
+ 			      ktime_to_us(ns_to_ktime(b->base_delay)));
+
+ 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
+ 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
+ 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
+
+ 		/* decaying flows are reported together with sparse ones */
+ 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
+ 					    b->decaying_flow_count);
+ 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
+ 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
+ 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
+
+ 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
+ 		nla_nest_end(skb, ts);
+ 	}
+
+ #undef PUT_TSTAT_U32
+ #undef PUT_TSTAT_U64
+
+ 	nla_nest_end(skb, tstats);
+ 	return nla_nest_end(skb, stats);
+
+ nla_put_failure:
+ 	nla_nest_cancel(skb, stats);
+ 	return -1;
+ }
+
+ /* Class op .leaf: always returns NULL. */
+ static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
+ {
+ return NULL;
+ }
+
+ /* Class op .find: always returns 0, whatever @classid is. */
+ static unsigned long cake_find(struct Qdisc *sch, u32 classid)
+ {
+ return 0;
+ }
+
+ /* Class op .bind_tcf: always returns 0. */
+ static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+ {
+ return 0;
+ }
+
+ /* Class op .unbind_tcf: intentionally empty. */
+ static void cake_unbind(struct Qdisc *q, unsigned long cl)
+ {
+ }
+
+ /* Class op .tcf_block: hand out the qdisc's filter block, but only for the
+  * qdisc itself (@cl == 0); any other class value yields NULL.
+  */
+ static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
+ 					struct netlink_ext_ack *extack)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+
+ 	return cl ? NULL : q->block;
+ }
+
+ /* Class op .dump: OR the minor part of @cl into tcm->tcm_handle.
+  * Always returns 0.
+  */
+ static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+ {
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+ }
+
+ /* Class op .dump_stats: report queue length, backlog and drop count of the
+  * flow addressed by @cl (1-based: tin position * CAKE_QUEUES + flow + 1),
+  * followed by its deficit and Cobalt state in a TCA_STATS_APP nest.
+  *
+  * A class id outside the configured tins yields zeroed queue statistics
+  * and no nest.  Returns 0 on success and -1 when @d has no room left.
+  */
+ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ 				 struct gnet_dump *d)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	const struct cake_flow *flow = NULL;
+ 	struct gnet_stats_queue qs = { 0 };
+ 	struct nlattr *stats;
+ 	u32 idx = cl - 1;
+ 	ktime_t now;
+
+ 	if (idx < CAKE_QUEUES * q->tin_cnt) {
+ 		const struct cake_tin_data *b =
+ 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
+ 		u32 slot = idx % CAKE_QUEUES;
+ 		const struct sk_buff *skb;
+
+ 		flow = &b->flows[slot];
+
+ 		if (flow->head) {
+ 			/* count the queued packets under the tree lock */
+ 			sch_tree_lock(sch);
+ 			for (skb = flow->head; skb; skb = skb->next)
+ 				qs.qlen++;
+ 			sch_tree_unlock(sch);
+ 		}
+ 		qs.backlog = b->backlogs[slot];
+ 		qs.drops = flow->dropped;
+ 	}
+
+ 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+ 		return -1;
+
+ 	if (!flow)
+ 		return 0;
+
+ 	now = ktime_get();
+
+ 	stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ 	if (!stats)
+ 		return -1;
+
+ #define PUT_STAT_U32(attr, data) do {				       \
+ 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ 			goto nla_put_failure;			       \
+ 	} while (0)
+ #define PUT_STAT_S32(attr, data) do {				       \
+ 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ 			goto nla_put_failure;			       \
+ 	} while (0)
+
+ 	PUT_STAT_S32(DEFICIT, flow->deficit);
+ 	PUT_STAT_U32(DROPPING, flow->cvars.dropping);
+ 	PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
+ 	PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
+
+ 	/* the timers are reported relative to the current time */
+ 	if (flow->cvars.p_drop)
+ 		PUT_STAT_S32(BLUE_TIMER_US,
+ 			     ktime_to_us(ktime_sub(now,
+ 						   flow->cvars.blue_timer)));
+
+ 	if (flow->cvars.dropping)
+ 		PUT_STAT_S32(DROP_NEXT_US,
+ 			     ktime_to_us(ktime_sub(now,
+ 						   flow->cvars.drop_next)));
+
+ 	if (nla_nest_end(d->skb, stats) < 0)
+ 		return -1;
+
+ 	return 0;
+
+ nla_put_failure:
+ 	nla_nest_cancel(d->skb, stats);
+ 	return -1;
+ }
+
+ /* Class op .walk: call arg->fn for every flow that is currently linked
+  * into a flowchain, honouring arg->skip.  Class ids are
+  * tin position * CAKE_QUEUES + flow index + 1.
+  *
+  * A negative return from arg->fn sets arg->stop and leaves the flow loop of
+  * the current tin; the loop over the tins itself is not interrupted.
+  */
+ static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+ {
+ 	struct cake_sched_data *q = qdisc_priv(sch);
+ 	unsigned int tin_pos, slot;
+
+ 	if (arg->stop)
+ 		return;
+
+ 	for (tin_pos = 0; tin_pos < q->tin_cnt; tin_pos++) {
+ 		struct cake_tin_data *b = &q->tins[q->tin_order[tin_pos]];
+
+ 		for (slot = 0; slot < CAKE_QUEUES; slot++) {
+ 			bool visit = !list_empty(&b->flows[slot].flowchain) &&
+ 				     arg->count >= arg->skip;
+
+ 			if (visit &&
+ 			    arg->fn(sch, tin_pos * CAKE_QUEUES + slot + 1,
+ 				    arg) < 0) {
+ 				arg->stop = 1;
+ 				break;
+ 			}
+ 			arg->count++;
+ 		}
+ 	}
+ }
+
+ /* Class operations table, referenced by cake_qdisc_ops.cl_ops. */
+ static const struct Qdisc_class_ops cake_class_ops = {
+ .leaf = cake_leaf,
+ .find = cake_find,
+ .tcf_block = cake_tcf_block,
+ .bind_tcf = cake_bind,
+ .unbind_tcf = cake_unbind,
+ .dump = cake_dump_class,
+ .dump_stats = cake_dump_class_stats,
+ .walk = cake_walk,
+ };
+
+ /* Qdisc operations for the "cake" discipline; registered by
+  * cake_module_init() and unregistered by cake_module_exit().
+  */
+ static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
+ .cl_ops = &cake_class_ops,
+ .id = "cake",
+ .priv_size = sizeof(struct cake_sched_data),
+ .enqueue = cake_enqueue,
+ .dequeue = cake_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = cake_init,
+ .reset = cake_reset,
+ .destroy = cake_destroy,
+ .change = cake_change,
+ .dump = cake_dump,
+ .dump_stats = cake_dump_stats,
+ .owner = THIS_MODULE,
+ };
+
+ /* Module entry point: register the qdisc and return register_qdisc()'s
+  * result.
+  */
+ static int __init cake_module_init(void)
+ {
+ return register_qdisc(&cake_qdisc_ops);
+ }
+
+ /* Module exit point: unregister the qdisc. */
+ static void __exit cake_module_exit(void)
+ {
+ unregister_qdisc(&cake_qdisc_ops);
+ }
+
+module_init(cake_module_init)
+module_exit(cake_module_exit)
+MODULE_AUTHOR("Jonathan Morton");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("The CAKE shaper.");
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000000..1538d6fa8165
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_etf.c Earliest TxTime First queueing discipline.
+ *
+ * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ * Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+/* Test the respective TC_ETF_* bit in the ->flags member of x (non-zero when
+ * the bit is set).
+ */
+#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+
+/* Private per-qdisc state of the ETF scheduler. */
+struct etf_sched_data {
+ /* Set in etf_init() from the TC_ETF_OFFLOAD_ON flag. */
+ bool offload;
+ /* Set in etf_init() from the TC_ETF_DEADLINE_MODE_ON flag. */
+ bool deadline_mode;
+ /* Clock id the qdisc operates on; packets whose socket uses another
+ * clockid are rejected at enqueue.
+ */
+ int clockid;
+ /* Index of this qdisc's tx queue, relative to tx queue 0 of the device. */
+ int queue;
+ s32 delta; /* in ns */
+ ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+ /* Root of the rb-tree holding queued skbs, keyed by skb->tstamp. */
+ struct rb_root head;
+ /* Watchdog armed for (txtime - delta) of the earliest queued skb. */
+ struct qdisc_watchdog watchdog;
+ /* Reads the current time of ->clockid; selected in etf_init(). */
+ ktime_t (*get_time)(void);
+};
+
+/* Netlink attribute policy: TCA_ETF_PARMS is sized as a struct tc_etf_qopt. */
+static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
+ [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) },
+};
+
+/* Sanity-check the options supplied by user space.
+ *
+ * The rules are:
+ *  - dynamic (negative) clockids are refused with -ENOTSUPP;
+ *  - any clockid other than CLOCK_TAI is refused with -EINVAL;
+ *  - delta must not be negative, otherwise -EINVAL is returned.
+ *
+ * On failure an explanatory message is stored in extack; 0 is returned
+ * when all rules hold.
+ *
+ * Also note that for the HW offload case, we must expect that system
+ * clocks have been synchronized to PHC.
+ */
+static inline int validate_input_params(struct tc_etf_qopt *qopt,
+					struct netlink_ext_ack *extack)
+{
+	int ret = 0;
+
+	if (qopt->clockid < 0) {
+		NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
+		ret = -ENOTSUPP;
+	} else if (qopt->clockid != CLOCK_TAI) {
+		NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
+		ret = -EINVAL;
+	} else if (qopt->delta < 0) {
+		NL_SET_ERR_MSG(extack, "Delta must be positive");
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/* Decide whether nskb may be queued on this qdisc.
+ *
+ * A packet is accepted only when all of the following hold:
+ *  - it is owned by a full socket that has SOCK_TXTIME set;
+ *  - the socket's clockid and deadline mode match the qdisc's;
+ *  - its txtime (skb->tstamp) is neither in the past nor earlier than the
+ *    txtime of the last packet handed to the device.
+ *
+ * Returns true when the packet is acceptable, false otherwise.
+ */
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	ktime_t txtime = nskb->tstamp;
+	struct sock *sk = nskb->sk;
+	ktime_t now;
+
+	/* skb->sk may be a request or timewait mini-socket, which lacks the
+	 * flags and txtime fields inspected below; only full sockets can
+	 * have SO_TXTIME configured.
+	 */
+	if (!sk || !sk_fullsock(sk))
+		return false;
+
+	if (!sock_flag(sk, SOCK_TXTIME))
+		return false;
+
+	/* We don't perform crosstimestamping.
+	 * Drop if packet's clockid differs from qdisc's.
+	 */
+	if (sk->sk_clockid != q->clockid)
+		return false;
+
+	if (sk->sk_txtime_deadline_mode != q->deadline_mode)
+		return false;
+
+	now = q->get_time();
+	if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+		return false;
+
+	return true;
+}
+
+/* Return the queued skb with the smallest key in the rb-tree (its leftmost
+ * node) without removing it, or NULL when the tree is empty.
+ */
+static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *first = rb_first(&q->head);
+
+	return first ? rb_to_skb(first) : NULL;
+}
+
+/* (Re)arm the qdisc watchdog for the earliest queued packet.
+ *
+ * The timer is programmed to expire at (txtime - delta) of the packet at
+ * the head of the time-sorted tree.  When nothing is queued any pending
+ * timer is cancelled, so that a stale expiry left over from a packet that
+ * has since been dequeued or dropped does not wake the qdisc needlessly.
+ */
+static void reset_watchdog(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = etf_peek_timesortedlist(sch);
+	ktime_t next;
+
+	if (!skb) {
+		qdisc_watchdog_cancel(&q->watchdog);
+		return;
+	}
+
+	next = ktime_sub_ns(skb->tstamp, q->delta);
+	qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+/* Report a txtime related error to the socket owning skb.
+ *
+ * Nothing is done unless skb belongs to a full socket that has requested
+ * txtime error reports.  Otherwise a clone of skb is tagged with the given
+ * errno and code (origin SO_EE_ORIGIN_TXTIME), carries the 64-bit txtime
+ * split over ee_data (high 32 bits) and ee_info (low 32 bits), and is
+ * queued on the socket's error queue.  The clone is freed if queueing
+ * fails; skb itself is left untouched.
+ */
+static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *clone;
+	ktime_t txtime = skb->tstamp;
+	struct sock *sk = skb->sk;
+
+	/* Request and timewait mini-sockets have neither the report flag
+	 * nor an error queue, so only full sockets are eligible.
+	 */
+	if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors))
+		return;
+
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (!clone)
+		return;
+
+	serr = SKB_EXT_ERR(clone);
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME;
+	serr->ee.ee_type = 0;
+	serr->ee.ee_code = code;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
+	serr->ee.ee_info = txtime; /* low part of tstamp */
+
+	if (sock_queue_err_skb(sk, clone))
+		kfree_skb(clone);
+}
+
+/* Enqueue nskb into the rb-tree, ordered by txtime (skb->tstamp).
+ *
+ * A packet rejected by is_packet_valid() is passed to report_sock_error()
+ * with EINVAL and dropped; the return value is then that of qdisc_drop().
+ * Otherwise the packet is linked into the tree, backlog and qlen are
+ * updated, the watchdog is re-armed and NET_XMIT_SUCCESS is returned.
+ */
+static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node **p = &q->head.rb_node, *parent = NULL;
+ ktime_t txtime = nskb->tstamp;
+
+ if (!is_packet_valid(sch, nskb)) {
+ report_sock_error(nskb, EINVAL,
+ SO_EE_CODE_TXTIME_INVALID_PARAM);
+ return qdisc_drop(nskb, sch, to_free);
+ }
+
+ /* Find the insertion point: strictly later txtimes descend to the
+ * right, equal or earlier ones to the left.
+ */
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (ktime_after(txtime, skb->tstamp))
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->head);
+
+ qdisc_qstats_backlog_inc(sch, nskb);
+ sch->q.qlen++;
+
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return NET_XMIT_SUCCESS;
+}
+
+/* Remove skb from the rb-tree and account for it.
+ *
+ * With drop == true the packet is reported to its socket as
+ * ECANCELED / SO_EE_CODE_TXTIME_MISSED, dropped and freed, and the
+ * overlimit counter is bumped.  With drop == false the byte/packet
+ * statistics are updated and q->last records the packet's txtime; the
+ * caller keeps ownership of skb.  In both cases backlog and qlen are
+ * decremented.
+ */
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+ bool drop)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ rb_erase(&skb->rbnode, &q->head);
+
+ /* The rbnode field in the skb re-uses these fields, now that
+ * we are done with the rbnode, reset them.
+ */
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->dev = qdisc_dev(sch);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+
+ if (drop) {
+ struct sk_buff *to_free = NULL;
+
+ report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+
+ /* qdisc_drop() chains skb onto to_free; release it right away. */
+ qdisc_drop(skb, sch, &to_free);
+ kfree_skb_list(to_free);
+ qdisc_qstats_overlimit(sch);
+ } else {
+ qdisc_bstats_update(sch, skb);
+
+ q->last = skb->tstamp;
+ }
+
+ sch->q.qlen--;
+}
+
+/* Dequeue the packet with the earliest txtime, if it is due.
+ *
+ * Returns NULL when the tree is empty, when the head packet has already
+ * expired (it is dropped instead), or when it is not yet inside its
+ * transmission window.  The watchdog is re-armed on every path that found
+ * a packet at the head of the tree.
+ */
+static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ ktime_t now, next;
+
+ skb = etf_peek_timesortedlist(sch);
+ if (!skb)
+ return NULL;
+
+ now = q->get_time();
+
+ /* Drop if packet has expired while in queue. */
+ if (ktime_before(skb->tstamp, now)) {
+ timesortedlist_erase(sch, skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ /* When in deadline mode, dequeue as soon as possible and overwrite
+ * the txtime (the deadline) with the current time.
+ */
+ if (q->deadline_mode) {
+ timesortedlist_erase(sch, skb, false);
+ skb->tstamp = now;
+ goto out;
+ }
+
+ next = ktime_sub_ns(skb->tstamp, q->delta);
+
+ /* Dequeue only if now is within the [txtime - delta, txtime] range. */
+ if (ktime_after(now, next))
+ timesortedlist_erase(sch, skb, false);
+ else
+ skb = NULL;
+
+out:
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return skb;
+}
+
+/* Ask the driver to turn ETF offload off for this qdisc's queue.
+ *
+ * Does nothing when offload was never enabled or the device has no
+ * ndo_setup_tc callback.  A driver failure is only logged.
+ */
+static void etf_disable_offload(struct net_device *dev,
+				struct etf_sched_data *q)
+{
+	struct tc_etf_qopt_offload etf = { };
+
+	if (!q->offload || !dev->netdev_ops->ndo_setup_tc)
+		return;
+
+	etf.queue = q->queue;
+	etf.enable = 0;
+
+	if (dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf) < 0)
+		pr_warn("Couldn't disable ETF offload for queue %d\n",
+			etf.queue);
+}
+
+/* Ask the driver to turn ETF offload on for this qdisc's queue.
+ *
+ * Returns 0 on success or when offload is already marked as enabled,
+ * -EOPNOTSUPP when the device has no ndo_setup_tc callback, or the
+ * negative error reported by the driver.  extack receives a message on
+ * failure.
+ */
+static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q,
+			      struct netlink_ext_ack *extack)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct tc_etf_qopt_offload etf = { };
+	int ret = 0;
+
+	if (q->offload)
+		goto done;
+
+	if (!ops->ndo_setup_tc) {
+		NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload");
+		ret = -EOPNOTSUPP;
+		goto done;
+	}
+
+	etf.queue = q->queue;
+	etf.enable = 1;
+
+	ret = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
+	if (ret < 0)
+		NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload");
+	else
+		ret = 0;
+
+done:
+	return ret;
+}
+
+/* Initialise an ETF qdisc instance from its netlink options.
+ *
+ * The options are mandatory: opt must carry a nested TCA_ETF_PARMS
+ * attribute, which is checked by validate_input_params().  On success the
+ * parameters are stored in the private data, the clock read function
+ * matching the clockid is selected and the watchdog is initialised on that
+ * clock.  Hardware offload is enabled first when requested.
+ *
+ * Returns 0 on success or a negative errno, with a message in extack.
+ */
+static int etf_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct nlattr *tb[TCA_ETF_MAX + 1];
+ struct tc_etf_qopt *qopt;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack,
+ "Missing ETF qdisc options which are mandatory");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETF_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
+ return -EINVAL;
+ }
+
+ qopt = nla_data(tb[TCA_ETF_PARMS]);
+
+ pr_debug("delta %d clockid %d offload %s deadline %s\n",
+ qopt->delta, qopt->clockid,
+ OFFLOAD_IS_ON(qopt) ? "on" : "off",
+ DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
+
+ err = validate_input_params(qopt, extack);
+ if (err < 0)
+ return err;
+
+ /* Index of our tx queue, relative to the device's tx queue 0. */
+ q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+ if (OFFLOAD_IS_ON(qopt)) {
+ err = etf_enable_offload(dev, q, extack);
+ if (err < 0)
+ return err;
+ }
+
+ /* Everything went OK, save the parameters used. */
+ q->delta = qopt->delta;
+ q->clockid = qopt->clockid;
+ q->offload = OFFLOAD_IS_ON(qopt);
+ q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+
+ /* validate_input_params() currently only lets CLOCK_TAI through, so
+ * the other cases are not reachable from here as the code stands.
+ */
+ switch (q->clockid) {
+ case CLOCK_REALTIME:
+ q->get_time = ktime_get_real;
+ break;
+ case CLOCK_MONOTONIC:
+ q->get_time = ktime_get;
+ break;
+ case CLOCK_BOOTTIME:
+ q->get_time = ktime_get_boottime;
+ break;
+ case CLOCK_TAI:
+ q->get_time = ktime_get_clocktai;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Clockid is not supported");
+ return -ENOTSUPP;
+ }
+
+ qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+ return 0;
+}
+
+/* Empty the rb-tree: every queued skb is unlinked, handed to
+ * rtnl_kfree_skbs() and subtracted from qlen.
+ */
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct rb_node *node, *following;
+
+	for (node = rb_first(&q->head); node; node = following) {
+		struct sk_buff *skb = rb_to_skb(node);
+
+		/* Fetch the successor before the node is unlinked. */
+		following = rb_next(node);
+
+		rb_erase(&skb->rbnode, &q->head);
+		rtnl_kfree_skbs(skb, skb);
+		sch->q.qlen--;
+	}
+}
+
+/* Reset the qdisc: cancel the watchdog (if initialised), free all queued
+ * packets and zero the backlog, qlen and last-txtime bookkeeping.
+ */
+static void etf_reset(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ /* Only cancel watchdog if it's been initialized. */
+ if (q->watchdog.qdisc == sch)
+ qdisc_watchdog_cancel(&q->watchdog);
+
+ /* No matter which mode we are on, it's safe to clear both lists. */
+ timesortedlist_clear(sch);
+ __qdisc_reset_queue(&sch->q);
+
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+
+ q->last = 0;
+}
+
+/* Tear down the qdisc: stop the watchdog when it was set up for this
+ * qdisc, then release hardware offload on the underlying device.
+ */
+static void etf_destroy(struct Qdisc *sch)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+
+	/* The watchdog points back at sch only once it has been initialized. */
+	if (q->watchdog.qdisc == sch)
+		qdisc_watchdog_cancel(&q->watchdog);
+
+	etf_disable_offload(qdisc_dev(sch), q);
+}
+
+/* Dump the current configuration as a TCA_ETF_PARMS attribute nested in
+ * TCA_OPTIONS.
+ *
+ * Returns the value of nla_nest_end() on success, or -1 after cancelling
+ * the nest when the message has no room left.
+ */
+static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct etf_sched_data *q = qdisc_priv(sch);
+	struct tc_etf_qopt opt = { };
+	struct nlattr *nest = nla_nest_start(skb, TCA_OPTIONS);
+
+	if (nest) {
+		opt.delta = q->delta;
+		opt.clockid = q->clockid;
+		if (q->offload)
+			opt.flags |= TC_ETF_OFFLOAD_ON;
+		if (q->deadline_mode)
+			opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+
+		if (!nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
+			return nla_nest_end(skb, nest);
+	}
+
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
+/* Qdisc operations table registered under the id "etf"; the private area
+ * of each instance is a struct etf_sched_data.  No .change or class
+ * operations are provided.
+ */
+static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
+ .id = "etf",
+ .priv_size = sizeof(struct etf_sched_data),
+ .enqueue = etf_enqueue_timesortedlist,
+ .dequeue = etf_dequeue_timesortedlist,
+ .peek = etf_peek_timesortedlist,
+ .init = etf_init,
+ .reset = etf_reset,
+ .destroy = etf_destroy,
+ .dump = etf_dump,
+ .owner = THIS_MODULE,
+};
+
+/* Module entry point: register the etf qdisc and return the result of
+ * register_qdisc().
+ */
+static int __init etf_module_init(void)
+{
+ return register_qdisc(&etf_qdisc_ops);
+}
+
+/* Module exit point: unregister the etf qdisc. */
+static void __exit etf_module_exit(void)
+{
+ unregister_qdisc(&etf_qdisc_ops);
+}
+module_init(etf_module_init)
+module_exit(etf_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 2a4ab7caf553..43c4bfe625a9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -126,7 +126,6 @@ struct htb_class {
union {
struct htb_class_leaf {
- struct list_head drop_list;
int deficit[TC_HTB_MAXDEPTH];
struct Qdisc *q;
} leaf;
@@ -171,7 +170,6 @@ struct htb_sched {
struct qdisc_watchdog watchdog;
s64 now; /* cached dequeue time */
- struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
/* time of nearest event per level (row) */
s64 near_ev_cache[TC_HTB_MAXDEPTH];
@@ -562,8 +560,6 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
if (!cl->prio_activity) {
cl->prio_activity = 1 << cl->prio;
htb_activate_prios(q, cl);
- list_add_tail(&cl->un.leaf.drop_list,
- q->drops + cl->prio);
}
}
@@ -579,7 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
htb_deactivate_prios(q, cl);
cl->prio_activity = 0;
- list_del_init(&cl->un.leaf.drop_list);
}
static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
@@ -981,7 +976,6 @@ static void htb_reset(struct Qdisc *sch)
else {
if (cl->un.leaf.q)
qdisc_reset(cl->un.leaf.q);
- INIT_LIST_HEAD(&cl->un.leaf.drop_list);
}
cl->prio_activity = 0;
cl->cmode = HTB_CAN_SEND;
@@ -993,8 +987,6 @@ static void htb_reset(struct Qdisc *sch)
sch->qstats.backlog = 0;
memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
- for (i = 0; i < TC_HTB_NUMPRIO; i++)
- INIT_LIST_HEAD(q->drops + i);
}
static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
@@ -1024,7 +1016,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_glob *gopt;
int err;
- int i;
qdisc_watchdog_init(&q->watchdog, sch);
INIT_WORK(&q->work, htb_work_func);
@@ -1050,8 +1041,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
err = qdisc_class_hash_init(&q->clhash);
if (err < 0)
return err;
- for (i = 0; i < TC_HTB_NUMPRIO; i++)
- INIT_LIST_HEAD(q->drops + i);
qdisc_skb_head_init(&q->direct_queue);
@@ -1224,7 +1213,6 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
parent->level = 0;
memset(&parent->un.inner, 0, sizeof(parent->un.inner));
- INIT_LIST_HEAD(&parent->un.leaf.drop_list);
parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
parent->tokens = parent->buffer;
parent->ctokens = parent->cbuffer;
@@ -1418,7 +1406,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
}
cl->children = 0;
- INIT_LIST_HEAD(&cl->un.leaf.drop_list);
RB_CLEAR_NODE(&cl->pq_node);
for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7d6801fc5340..ad18a2052416 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -68,6 +68,11 @@
Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
+/* Distribution table shared by the delay and slot distributions.
+ * The entries trail the header as a C99 flexible array member (rather
+ * than a deprecated zero-length array); size is presumably the number of
+ * entries in table - confirm against get_dist_table().
+ */
+struct disttable {
+	u32 size;
+	s16 table[];
+};
+
struct netem_sched_data {
/* internal t(ime)fifo qdisc uses t_root and sch->limit */
struct rb_root t_root;
@@ -99,10 +104,7 @@ struct netem_sched_data {
u32 rho;
} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
- struct disttable {
- u32 size;
- s16 table[0];
- } *delay_dist;
+ struct disttable *delay_dist;
enum {
CLG_RANDOM,
@@ -142,6 +144,7 @@ struct netem_sched_data {
s32 bytes_left;
} slot;
+ struct disttable *slot_dist;
};
/* Time stamp put into socket buffer control block
@@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state)
u64 value, rho;
unsigned long answer;
- if (state->rho == 0) /* no correlation */
+ if (!state || state->rho == 0) /* no correlation */
return prandom_u32();
value = prandom_u32();
@@ -601,10 +604,19 @@ finish_segs:
static void get_slot_next(struct netem_sched_data *q, u64 now)
{
- q->slot.slot_next = now + q->slot_config.min_delay +
- (prandom_u32() *
- (q->slot_config.max_delay -
- q->slot_config.min_delay) >> 32);
+ s64 next_delay;
+
+ if (!q->slot_dist)
+ next_delay = q->slot_config.min_delay +
+ (prandom_u32() *
+ (q->slot_config.max_delay -
+ q->slot_config.min_delay) >> 32);
+ else
+ next_delay = tabledist(q->slot_config.dist_delay,
+ (s32)(q->slot_config.dist_jitter),
+ NULL, q->slot_dist);
+
+ q->slot.slot_next = now + next_delay;
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
}
@@ -721,9 +733,9 @@ static void dist_free(struct disttable *d)
* signed 16 bit values.
*/
-static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
+static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
+ const struct nlattr *attr)
{
- struct netem_sched_data *q = qdisc_priv(sch);
size_t n = nla_len(attr)/sizeof(__s16);
const __s16 *data = nla_data(attr);
spinlock_t *root_lock;
@@ -744,7 +756,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
root_lock = qdisc_root_sleeping_lock(sch);
spin_lock_bh(root_lock);
- swap(q->delay_dist, d);
+ swap(*tbl, d);
spin_unlock_bh(root_lock);
dist_free(d);
@@ -762,7 +774,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
q->slot_config.max_bytes = INT_MAX;
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
- if (q->slot_config.min_delay | q->slot_config.max_delay)
+ if (q->slot_config.min_delay | q->slot_config.max_delay |
+ q->slot_config.dist_jitter)
q->slot.slot_next = ktime_get_ns();
else
q->slot.slot_next = 0;
@@ -926,16 +939,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
}
if (tb[TCA_NETEM_DELAY_DIST]) {
- ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
- if (ret) {
- /* recover clg and loss_model, in case of
- * q->clg and q->loss_model were modified
- * in get_loss_clg()
- */
- q->clg = old_clg;
- q->loss_model = old_loss_model;
- return ret;
- }
+ ret = get_dist_table(sch, &q->delay_dist,
+ tb[TCA_NETEM_DELAY_DIST]);
+ if (ret)
+ goto get_table_failure;
+ }
+
+ if (tb[TCA_NETEM_SLOT_DIST]) {
+ ret = get_dist_table(sch, &q->slot_dist,
+ tb[TCA_NETEM_SLOT_DIST]);
+ if (ret)
+ goto get_table_failure;
}
sch->limit = qopt->limit;
@@ -983,6 +997,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
get_slot(q, tb[TCA_NETEM_SLOT]);
return ret;
+
+get_table_failure:
+ /* recover clg and loss_model, in case of
+ * q->clg and q->loss_model were modified
+ * in get_loss_clg()
+ */
+ q->clg = old_clg;
+ q->loss_model = old_loss_model;
+ return ret;
}
static int netem_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1011,6 +1034,7 @@ static void netem_destroy(struct Qdisc *sch)
if (q->qdisc)
qdisc_destroy(q->qdisc);
dist_free(q->delay_dist);
+ dist_free(q->slot_dist);
}
static int dump_loss_model(const struct netem_sched_data *q,
@@ -1127,7 +1151,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
if (dump_loss_model(q, skb) != 0)
goto nla_put_failure;
- if (q->slot_config.min_delay | q->slot_config.max_delay) {
+ if (q->slot_config.min_delay | q->slot_config.max_delay |
+ q->slot_config.dist_jitter) {
slot = q->slot_config;
if (slot.max_packets == INT_MAX)
slot.max_packets = 0;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5d5a16204d50..297d9cf960b9 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init(
/* Initialize path max retrans value. */
asoc->pathmaxrxt = sp->pathmaxrxt;
+ asoc->flowlabel = sp->flowlabel;
+ asoc->dscp = sp->dscp;
+
/* Initialize default path MTU. */
asoc->pathmtu = sp->pathmtu;
@@ -647,6 +650,18 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
peer->sackdelay = asoc->sackdelay;
peer->sackfreq = asoc->sackfreq;
+ if (addr->sa.sa_family == AF_INET6) {
+ __be32 info = addr->v6.sin6_flowinfo;
+
+ if (info) {
+ peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK);
+ peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else {
+ peer->flowlabel = asoc->flowlabel;
+ }
+ }
+ peer->dscp = asoc->dscp;
+
/* Enable/disable heartbeat, SACK delay, and path MTU discovery
* based on association setting.
*/
diff --git a/net/sctp/input.c b/net/sctp/input.c
index ba8a6e6c36fa..9bbc5f92c941 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -56,6 +56,7 @@
#include <net/sctp/sm.h>
#include <net/sctp/checksum.h>
#include <net/net_namespace.h>
+#include <linux/rhashtable.h>
/* Forward declarations for internal helpers. */
static int sctp_rcv_ootb(struct sk_buff *);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0cd2e764f47f..fc6c5e4bffa5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
struct flowi6 *fl6 = &transport->fl.u.ip6;
+ __u8 tclass = np->tclass;
int res;
pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
skb->len, &fl6->saddr, &fl6->daddr);
- IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+ if (transport->dscp & SCTP_DSCP_SET_MASK)
+ tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
+
+ if (INET_ECN_is_capable(tclass))
+ IP6_ECN_flow_xmit(sk, fl6->flowlabel);
if (!(transport->param_flags & SPP_PMTUD_ENABLE))
skb->ignore_df = 1;
@@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
rcu_read_lock();
res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
- np->tclass);
+ tclass);
rcu_read_unlock();
return res;
}
@@ -254,6 +259,17 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl6->flowi6_oif = daddr->v6.sin6_scope_id;
else if (asoc)
fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if;
+ if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
+ fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
+
+ if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
+ struct ip6_flowlabel *flowlabel;
+
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+ if (!flowlabel)
+ goto out;
+ fl6_sock_release(flowlabel);
+ }
pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 67f73d3a1356..e948db29ab53 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct dst_entry *dst = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
+ __u8 tos = inet_sk(sk)->tos;
+ if (t->dscp & SCTP_DSCP_SET_MASK)
+ tos = t->dscp & SCTP_DSCP_VAL_MASK;
memset(fl4, 0x0, sizeof(struct flowi4));
fl4->daddr = daddr->v4.sin_addr.s_addr;
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
if (asoc) {
- fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+ fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos);
fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
fl4->fl4_sport = htons(asoc->base.bind_addr.port);
}
@@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl4->fl4_sport = laddr->a.v4.sin_port;
flowi4_update_output(fl4,
asoc->base.sk->sk_bound_dev_if,
- RT_CONN_FLAGS(asoc->base.sk),
+ RT_CONN_FLAGS_TOS(asoc->base.sk, tos),
daddr->v4.sin_addr.s_addr,
laddr->a.v4.sin_addr.s_addr);
@@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb,
struct sctp_transport *transport)
{
struct inet_sock *inet = inet_sk(skb->sk);
+ __u8 dscp = inet->tos;
pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
- skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr);
+ skb->len, &transport->fl.u.ip4.saddr,
+ &transport->fl.u.ip4.daddr);
+
+ if (transport->dscp & SCTP_DSCP_SET_MASK)
+ dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
- return ip_queue_xmit(&inet->sk, skb, &transport->fl);
+ return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp);
}
static struct sctp_af sctp_af_inet;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ce620e878538..502c0d7cb105 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -66,6 +66,7 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/compat.h>
+#include <linux/rhashtable.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -1696,6 +1697,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
struct sctp_association *asoc;
enum sctp_scope scope;
struct cmsghdr *cmsg;
+ __be32 flowinfo = 0;
struct sctp_af *af;
int err;
@@ -1780,6 +1782,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
if (!cmsgs->addrs_msg)
return 0;
+ if (daddr->sa.sa_family == AF_INET6)
+ flowinfo = daddr->v6.sin6_flowinfo;
+
/* sendv addr list parse */
for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
struct sctp_transport *transport;
@@ -1812,6 +1817,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
}
dlen = sizeof(struct in6_addr);
+ daddr->v6.sin6_flowinfo = flowinfo;
daddr->v6.sin6_family = AF_INET6;
daddr->v6.sin6_port = htons(asoc->peer.port);
memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
@@ -2392,6 +2398,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* uint32_t spp_pathmtu;
* uint32_t spp_sackdelay;
* uint32_t spp_flags;
+ * uint32_t spp_ipv6_flowlabel;
+ * uint8_t spp_dscp;
* };
*
* spp_assoc_id - (one-to-many style socket) This is filled in the
@@ -2471,6 +2479,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* also that this field is mutually exclusive to
* SPP_SACKDELAY_ENABLE, setting both will have undefined
* results.
+ *
+ * SPP_IPV6_FLOWLABEL: Setting this flag enables the
+ * setting of the IPV6 flow label value. The value is
+ * contained in the spp_ipv6_flowlabel field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_ipv6_flowlabel field has a valid value returned.
+ * If a specific destination address is set (in the
+ * spp_address field), then the value returned is that of
+ * the address. If just an association is specified (and
+ * no address), then the association's default flow label
+ * is returned. If neither an association nor a destination
+ * is specified, then the socket's default flow label is
+ * returned. For non-IPv6 sockets, this flag will be left
+ * cleared.
+ *
+ * SPP_DSCP: Setting this flag enables the setting of the
+ * Differentiated Services Code Point (DSCP) value
+ * associated with either the association or a specific
+ * address. The value is obtained in the spp_dscp field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_dscp field has a valid value returned. If a
+ * specific destination address is set when called (in the
+ * spp_address field), then that specific destination
+ * address's DSCP value is returned. If just an association
+ * is specified, then the association's default DSCP is
+ * returned. If neither an association nor a destination is
+ * specified, then the socket's default DSCP is returned.
+ *
+ * spp_ipv6_flowlabel
+ * - This field is used in conjunction with the
+ * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ * The 20 least significant bits are used for the flow
+ * label. This setting has precedence over any IPv6-layer
+ * setting.
+ *
+ * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
+ * and contains the DSCP. The 6 most significant bits are
+ * used for the DSCP. This setting has precedence over any
+ * IPv4- or IPv6- layer setting.
*/
static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
struct sctp_transport *trans,
@@ -2610,6 +2657,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
}
}
+ if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
+ if (trans && trans->ipaddr.sa.sa_family == AF_INET6) {
+ trans->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else if (asoc) {
+ list_for_each_entry(trans,
+ &asoc->peer.transport_addr_list,
+ transports) {
+ if (trans->ipaddr.sa.sa_family != AF_INET6)
+ continue;
+ trans->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ }
+ asoc->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) {
+ sp->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ }
+ }
+
+ if (params->spp_flags & SPP_DSCP) {
+ if (trans) {
+ trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ trans->dscp |= SCTP_DSCP_SET_MASK;
+ } else if (asoc) {
+ list_for_each_entry(trans,
+ &asoc->peer.transport_addr_list,
+ transports) {
+ trans->dscp = params->spp_dscp &
+ SCTP_DSCP_VAL_MASK;
+ trans->dscp |= SCTP_DSCP_SET_MASK;
+ }
+ asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ asoc->dscp |= SCTP_DSCP_SET_MASK;
+ } else {
+ sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ sp->dscp |= SCTP_DSCP_SET_MASK;
+ }
+ }
+
return 0;
}
@@ -2624,11 +2716,18 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk,
int error;
int hb_change, pmtud_change, sackdelay_change;
- if (optlen != sizeof(struct sctp_paddrparams))
+ if (optlen == sizeof(params)) {
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+ } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams,
+ spp_ipv6_flowlabel), 4)) {
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+ if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL))
+ return -EINVAL;
+ } else {
return -EINVAL;
-
- if (copy_from_user(&params, optval, optlen))
- return -EFAULT;
+ }
/* Validate flags and value parameters. */
hb_change = params.spp_flags & SPP_HB;
@@ -4169,6 +4268,28 @@ out:
return retval;
}
+/* Handle setsockopt(SCTP_REUSE_PORT).
+ *
+ * Only TCP-style sockets are supported (-EOPNOTSUPP otherwise), and the
+ * option is refused with -EFAULT once the endpoint has a bound port.
+ * optval must hold at least an int (-EINVAL if shorter, -EFAULT if it
+ * cannot be read); any non-zero value turns the reuse flag on.
+ */
+static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
+				      unsigned int optlen)
+{
+	struct sctp_sock *sp = sctp_sk(sk);
+	int enable;
+
+	if (!sctp_style(sk, TCP))
+		return -EOPNOTSUPP;
+
+	if (sp->ep->base.bind_addr.port)
+		return -EFAULT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(enable, (int __user *)optval))
+		return -EFAULT;
+
+	sp->reuse = enable ? 1 : 0;
+
+	return 0;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4363,6 +4484,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_interleaving_supported(sk, optval,
optlen);
break;
+ case SCTP_REUSE_PORT:
+ retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -5427,6 +5551,45 @@ out:
* also that this field is mutually exclusive to
* SPP_SACKDELAY_ENABLE, setting both will have undefined
* results.
+ *
+ * SPP_IPV6_FLOWLABEL: Setting this flag enables the
+ * setting of the IPV6 flow label value. The value is
+ * contained in the spp_ipv6_flowlabel field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_ipv6_flowlabel field has a valid value returned.
+ * If a specific destination address is set (in the
+ * spp_address field), then the value returned is that of
+ * the address. If just an association is specified (and
+ * no address), then the association's default flow label
+ * is returned. If neither an association nor a destination
+ * is specified, then the socket's default flow label is
+ * returned. For non-IPv6 sockets, this flag will be left
+ * cleared.
+ *
+ * SPP_DSCP: Setting this flag enables the setting of the
+ * Differentiated Services Code Point (DSCP) value
+ * associated with either the association or a specific
+ * address. The value is obtained in the spp_dscp field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_dscp field has a valid value returned. If a
+ * specific destination address is set when called (in the
+ * spp_address field), then that specific destination
+ * address's DSCP value is returned. If just an association
+ * is specified, then the association's default DSCP is
+ * returned. If neither an association nor a destination is
+ * specified, then the socket's default DSCP is returned.
+ *
+ * spp_ipv6_flowlabel
+ * - This field is used in conjunction with the
+ * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ * The 20 least significant bits are used for the flow
+ * label. This setting has precedence over any IPv6-layer
+ * setting.
+ *
+ * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
+ * and contains the DSCP. The 6 most significant bits are
+ * used for the DSCP. This setting has precedence over any
+ * IPv4- or IPv6- layer setting.
*/
static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
char __user *optval, int __user *optlen)
@@ -5436,9 +5599,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
struct sctp_association *asoc = NULL;
struct sctp_sock *sp = sctp_sk(sk);
- if (len < sizeof(struct sctp_paddrparams))
+ if (len >= sizeof(params))
+ len = sizeof(params);
+ else if (len >= ALIGN(offsetof(struct sctp_paddrparams,
+ spp_ipv6_flowlabel), 4))
+ len = ALIGN(offsetof(struct sctp_paddrparams,
+ spp_ipv6_flowlabel), 4);
+ else
return -EINVAL;
- len = sizeof(struct sctp_paddrparams);
+
if (copy_from_user(&params, optval, len))
return -EFAULT;
@@ -5473,6 +5642,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = trans->param_flags;
+ if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = trans->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (trans->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
} else if (asoc) {
/* Fetch association values. */
params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
@@ -5482,6 +5660,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = asoc->param_flags;
+ if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = asoc->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (asoc->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
} else {
/* Fetch socket values. */
params.spp_hbinterval = sp->hbinterval;
@@ -5491,6 +5678,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = sp->param_flags;
+ if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = sp->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (sp->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
}
if (copy_to_user(optval, &params, len))
@@ -7196,6 +7392,26 @@ out:
return retval;
}
+static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{ /* SCTP_REUSE_PORT getter: copies sctp_sk(sk)->reuse out as an int; returns 0, -EINVAL or -EFAULT */
+ int val;
+
+ if (len < sizeof(int)) /* caller's buffer cannot hold an int */
+ return -EINVAL;
+
+ len = sizeof(int); /* exactly one int is reported, whatever larger size was passed */
+ val = sctp_sk(sk)->reuse;
+ if (put_user(len, optlen)) /* write the length actually used back to the caller */
+ return -EFAULT;
+
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -7391,6 +7607,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
optlen);
break;
+ case SCTP_REUSE_PORT:
+ retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7428,6 +7647,7 @@ static struct sctp_bind_bucket *sctp_bucket_create(
static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
{
+ bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
struct sctp_bind_bucket *pp;
unsigned short snum;
@@ -7500,13 +7720,11 @@ pp_found:
* used by other socket (pp->owner not empty); that other
* socket is going to be sk2.
*/
- int reuse = sk->sk_reuse;
struct sock *sk2;
pr_debug("%s: found a possible match\n", __func__);
- if (pp->fastreuse && sk->sk_reuse &&
- sk->sk_state != SCTP_SS_LISTENING)
+ if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
goto success;
/* Run through the list of sockets bound to the port
@@ -7524,7 +7742,7 @@ pp_found:
ep2 = sctp_sk(sk2)->ep;
if (sk == sk2 ||
- (reuse && sk2->sk_reuse &&
+ (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
sk2->sk_state != SCTP_SS_LISTENING))
continue;
@@ -7548,12 +7766,12 @@ pp_not_found:
* SO_REUSEADDR on this socket -sk-).
*/
if (hlist_empty(&pp->owner)) {
- if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING)
+ if (reuse && sk->sk_state != SCTP_SS_LISTENING)
pp->fastreuse = 1;
else
pp->fastreuse = 0;
} else if (pp->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING))
+ (!reuse || sk->sk_state == SCTP_SS_LISTENING))
pp->fastreuse = 0;
/* We are set, so fill up all the data in the hash table
@@ -7684,7 +7902,7 @@ int sctp_inet_listen(struct socket *sock, int backlog)
err = 0;
sctp_unhash_endpoint(ep);
sk->sk_state = SCTP_SS_CLOSED;
- if (sk->sk_reuse)
+ if (sk->sk_reuse || sctp_sk(sk)->reuse)
sctp_sk(sk)->bind_hash->fastreuse = 1;
goto out;
}
@@ -8551,6 +8769,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newsk->sk_no_check_tx = sk->sk_no_check_tx;
newsk->sk_no_check_rx = sk->sk_no_check_rx;
newsk->sk_reuse = sk->sk_reuse;
+ sctp_sk(newsk)->reuse = sp->reuse;
newsk->sk_shutdown = sk->sk_shutdown;
newsk->sk_destruct = sctp_destruct_sock;
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 188104654b54..4df96b4b8130 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 05e4ffe5aabd..143b2220c0c8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -23,6 +23,7 @@
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>
+#include <linux/if_vlan.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -35,6 +36,7 @@
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
+#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
@@ -381,8 +383,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
return 0;
}
-static void smc_conn_save_peer_info(struct smc_sock *smc,
- struct smc_clc_msg_accept_confirm *clc)
+static void smcr_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
{
int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
@@ -393,6 +395,28 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
+static void smcd_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{ /* SMC-D: store the peer's DMBE index, token and buffer size taken from the CLC accept/confirm message */
+ int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
+
+ smc->conn.peer_rmbe_idx = clc->dmbe_idx;
+ smc->conn.peer_token = clc->token;
+ /* msg header takes up space in the buffer */
+ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); /* space is initialized to the full usable size */
+ smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; /* uses the full buffer size, header included */
+}
+
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{ /* Dispatch on the link group type: SMC-D variant when lgr->is_smcd is set, SMC-R variant otherwise */
+ if (smc->conn.lgr->is_smcd)
+ smcd_conn_save_peer_info(smc, clc);
+ else
+ smcr_conn_save_peer_info(smc, clc);
+}
+
static void smc_link_save_peer_info(struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc)
{
@@ -463,15 +487,51 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
return reason_code;
}
+/* Check whether an ISM device is available for this connection: returns 0 if *ismdev is non-NULL after the lookup, else SMC_CLC_DECL_CNFERR. */
+/* Called for connect and listen. */
+static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
+{
+ /* Find ISM device with same PNETID as connecting interface */
+ smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
+ if (!(*ismdev))
+ return SMC_CLC_DECL_CNFERR; /* configuration error */
+ return 0;
+}
+
+/* Register a non-zero VLAN ID on the ISM device just for the CLC handshake; returns 0, or SMC_CLC_DECL_CNFERR if registration fails. */
+static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
+ struct smcd_dev *ismdev,
+ unsigned short vlan_id)
+{ /* the smc argument is not used in this function */
+ if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id)) /* vlan_id 0: nothing to register */
+ return SMC_CLC_DECL_CNFERR;
+ return 0;
+}
+
+/* Clean up the temporary VLAN ID registration used for the CLC handshake. If ISM
+ * is used, the VLAN ID will be registered again during the connection setup.
+ * Returns 0, or SMC_CLC_DECL_CNFERR if smc_ism_put_vlan() fails. */
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
+ struct smcd_dev *ismdev,
+ unsigned short vlan_id)
+{
+ if (!is_smcd) /* ISM not in use for this connection: nothing to undo */
+ return 0;
+ if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id)) /* vlan_id 0 was never registered */
+ return SMC_CLC_DECL_CNFERR;
+ return 0;
+}
+
/* CLC handshake during connect */
-static int smc_connect_clc(struct smc_sock *smc,
+static int smc_connect_clc(struct smc_sock *smc, int smc_type,
struct smc_clc_msg_accept_confirm *aclc,
- struct smc_ib_device *ibdev, u8 ibport)
+ struct smc_ib_device *ibdev, u8 ibport,
+ struct smcd_dev *ismdev)
{
int rc = 0;
/* do inband token exchange */
- rc = smc_clc_send_proposal(smc, ibdev, ibport);
+ rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, ismdev);
if (rc)
return rc;
/* receive SMC Accept CLC message */
@@ -488,8 +548,8 @@ static int smc_connect_rdma(struct smc_sock *smc,
int reason_code = 0;
mutex_lock(&smc_create_lgr_pending);
- local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
- aclc->hdr.flag);
+ local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
+ ibport, &aclc->lcl, NULL, 0);
if (local_contact < 0) {
if (local_contact == -ENOMEM)
reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
@@ -504,7 +564,7 @@ static int smc_connect_rdma(struct smc_sock *smc,
smc_conn_save_peer_info(smc, aclc);
/* create send buffer and rmb */
- if (smc_buf_create(smc))
+ if (smc_buf_create(smc, false))
return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
if (local_contact == SMC_FIRST_CONTACT)
@@ -551,11 +611,50 @@ static int smc_connect_rdma(struct smc_sock *smc,
return 0;
}
+/* Set up the ISM (SMC-D) connection on the client side; returns 0 or the reason code passed through smc_connect_abort(). */
+static int smc_connect_ism(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smcd_dev *ismdev)
+{
+ int local_contact = SMC_FIRST_CONTACT;
+ int rc = 0;
+
+ mutex_lock(&smc_create_lgr_pending); /* NOTE(review): error paths below do not unlock here - presumably smc_connect_abort() does; confirm */
+ local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0,
+ NULL, ismdev, aclc->gid);
+ if (local_contact < 0) /* every failure is reported as SMC_CLC_DECL_MEM */
+ return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
+
+ /* Create send and receive buffers */
+ if (smc_buf_create(smc, true))
+ return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+
+ smc_conn_save_peer_info(smc, aclc);
+ smc_close_init(smc);
+ smc_rx_init(smc);
+ smc_tx_init(smc);
+
+ rc = smc_clc_send_confirm(smc);
+ if (rc)
+ return smc_connect_abort(smc, rc, local_contact);
+ mutex_unlock(&smc_create_lgr_pending);
+
+ smc_copy_sock_settings_to_clc(smc);
+ if (smc->sk.sk_state == SMC_INIT) /* only move to SMC_ACTIVE from the initial state */
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return 0;
+}
+
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
+ bool ism_supported = false, rdma_supported = false;
struct smc_clc_msg_accept_confirm aclc;
struct smc_ib_device *ibdev;
+ struct smcd_dev *ismdev;
+ unsigned short vlan;
+ int smc_type;
int rc = 0;
u8 ibport;
@@ -572,20 +671,52 @@ static int __smc_connect(struct smc_sock *smc)
if (using_ipsec(smc))
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
- /* check if a RDMA device is available; if not, fall back */
- if (smc_check_rdma(smc, &ibdev, &ibport))
+ /* check for VLAN ID */
+ if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+
+ /* check if there is an ism device available */
+ if (!smc_check_ism(smc, &ismdev) &&
+ !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
+ /* ISM is supported for this connection */
+ ism_supported = true;
+ smc_type = SMC_TYPE_D;
+ }
+
+ /* check if there is a rdma device available */
+ if (!smc_check_rdma(smc, &ibdev, &ibport)) {
+ /* RDMA is supported for this connection */
+ rdma_supported = true;
+ if (ism_supported)
+ smc_type = SMC_TYPE_B; /* both */
+ else
+ smc_type = SMC_TYPE_R; /* only RDMA */
+ }
+
+ /* if neither ISM nor RDMA are supported, fallback */
+ if (!rdma_supported && !ism_supported)
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
/* perform CLC handshake */
- rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
- if (rc)
+ rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, ismdev);
+ if (rc) {
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return smc_connect_decline_fallback(smc, rc);
+ }
- /* connect using rdma */
- rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
- if (rc)
+ /* depending on previous steps, connect using rdma or ism */
+ if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
+ rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
+ else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
+ rc = smc_connect_ism(smc, &aclc, ismdev);
+ else
+ rc = SMC_CLC_DECL_CNFERR;
+ if (rc) {
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return smc_connect_decline_fallback(smc, rc);
+ }
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return 0;
}
@@ -953,7 +1084,8 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
int *local_contact)
{
/* allocate connection / link group */
- *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
+ *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport,
+ &pclc->lcl, NULL, 0);
if (*local_contact < 0) {
if (*local_contact == -ENOMEM)
return SMC_CLC_DECL_MEM;/* insufficient memory*/
@@ -961,12 +1093,50 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
}
/* create send buffer and rmb */
- if (smc_buf_create(new_smc))
+ if (smc_buf_create(new_smc, false))
return SMC_CLC_DECL_MEM;
return 0;
}
+/* listen worker: initialize connection and buffers for SMC-D; returns 0 or an SMC_CLC_DECL_* code and stores smc_conn_create()'s result in *local_contact */
+static int smc_listen_ism_init(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smcd_dev *ismdev,
+ int *local_contact)
+{
+ struct smc_clc_msg_smcd *pclc_smcd;
+
+ pclc_smcd = smc_get_clc_msg_smcd(pclc); /* SMC-D part of the proposal; supplies the peer GID below */
+ *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL,
+ ismdev, pclc_smcd->gid);
+ if (*local_contact < 0) {
+ if (*local_contact == -ENOMEM)
+ return SMC_CLC_DECL_MEM; /* insufficient memory */
+ return SMC_CLC_DECL_INTERR; /* other error */
+ }
+
+ /* Check if peer can be reached via ISM device (a non-zero result is treated as unreachable) */
+ if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
+ new_smc->conn.lgr->vlan_id,
+ new_smc->conn.lgr->smcd)) {
+ if (*local_contact == SMC_FIRST_CONTACT) /* forget the link group only on first contact */
+ smc_lgr_forget(new_smc->conn.lgr);
+ smc_conn_free(&new_smc->conn);
+ return SMC_CLC_DECL_CNFERR;
+ }
+
+ /* Create send and receive buffers */
+ if (smc_buf_create(new_smc, true)) {
+ if (*local_contact == SMC_FIRST_CONTACT) /* same cleanup as the unreachable-peer path above */
+ smc_lgr_forget(new_smc->conn.lgr);
+ smc_conn_free(&new_smc->conn);
+ return SMC_CLC_DECL_MEM;
+ }
+
+ return 0;
+}
+
/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
@@ -1025,6 +1195,8 @@ static void smc_listen_work(struct work_struct *work)
struct smc_clc_msg_accept_confirm cclc;
struct smc_clc_msg_proposal *pclc;
struct smc_ib_device *ibdev;
+ bool ism_supported = false;
+ struct smcd_dev *ismdev;
u8 buf[SMC_CLC_MAX_LEN];
int local_contact = 0;
int reason_code = 0;
@@ -1065,12 +1237,21 @@ static void smc_listen_work(struct work_struct *work)
smc_rx_init(new_smc);
smc_tx_init(new_smc);
+ /* check if ISM is available */
+ if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
+ !smc_check_ism(new_smc, &ismdev) &&
+ !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
+ ism_supported = true;
+ }
+
/* check if RDMA is available */
- if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
- smc_listen_rdma_check(new_smc, pclc) ||
- smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
- &local_contact) ||
- smc_listen_rdma_reg(new_smc, local_contact)) {
+ if (!ism_supported &&
+ ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
+ smc_check_rdma(new_smc, &ibdev, &ibport) ||
+ smc_listen_rdma_check(new_smc, pclc) ||
+ smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
+ &local_contact) ||
+ smc_listen_rdma_reg(new_smc, local_contact))) {
/* SMC not supported, decline */
mutex_unlock(&smc_create_lgr_pending);
smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
@@ -1095,7 +1276,8 @@ static void smc_listen_work(struct work_struct *work)
}
/* finish worker */
- smc_listen_rdma_finish(new_smc, &cclc, local_contact);
+ if (!ism_supported)
+ smc_listen_rdma_finish(new_smc, &cclc, local_contact);
smc_conn_save_peer_info(new_smc, &cclc);
mutex_unlock(&smc_create_lgr_pending);
smc_listen_out_connected(new_smc);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index d7ca26570482..be20acd7b5ab 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -21,8 +21,6 @@
#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
-#define SMC_MAX_PORTS 2 /* Max # of ports */
-
extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -185,6 +183,11 @@ struct smc_connection {
spinlock_t acurs_lock; /* protect cursors */
#endif
struct work_struct close_work; /* peer sent some closing */
+ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */
+ u8 rx_off; /* receive offset:
+ * 0 for SMC-R, 32 for SMC-D
+ */
+ u64 peer_token; /* SMC-D token of peer */
};
struct smc_connect_info {
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index a7e8d63fc8ae..621d8cca570b 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -117,7 +117,7 @@ int smc_cdc_msg_send(struct smc_connection *conn,
return rc;
}
-int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
{
struct smc_cdc_tx_pend *pend;
struct smc_wr_buf *wr_buf;
@@ -130,6 +130,21 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
return smc_cdc_msg_send(conn, wr_buf, pend);
}
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{ /* Send a CDC message for conn, choosing the SMC-D or SMC-R path by lgr->is_smcd; returns the chosen path's result */
+ int rc;
+
+ if (conn->lgr->is_smcd) {
+ spin_lock_bh(&conn->send_lock); /* the SMC-D send runs under conn->send_lock */
+ rc = smcd_cdc_msg_send(conn);
+ spin_unlock_bh(&conn->send_lock);
+ } else {
+ rc = smcr_cdc_get_slot_and_msg_send(conn); /* no locking is done here for the SMC-R path */
+ }
+
+ return rc;
+}
+
static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
unsigned long data)
{
@@ -157,6 +172,45 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
(unsigned long)conn);
}
+/* Send an SMC-D CDC header built from conn->local_tx_ctrl.
+ * This increments the free space available in our send buffer.
+ * Also update the confirmed receive buffer with what was sent to the peer.
+ * Returns 0, or the error from smcd_tx_ism_write() with no local state updated. */
+int smcd_cdc_msg_send(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smcd_cdc_msg cdc;
+ int rc, diff;
+
+ memset(&cdc, 0, sizeof(cdc)); /* the res1/res2/res3 fields stay zero */
+ cdc.common.type = SMC_CDC_MSG_TYPE;
+ cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap;
+ cdc.prod_count = conn->local_tx_ctrl.prod.count;
+
+ cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap;
+ cdc.cons_count = conn->local_tx_ctrl.cons.count;
+ cdc.prod_flags = conn->local_tx_ctrl.prod_flags;
+ cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags;
+ rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1); /* NOTE(review): 0 and 1 are presumably offset and signal flag - confirm */
+ if (rc)
+ return rc;
+ smc_curs_write(&conn->rx_curs_confirmed,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn);
+ /* Calculate transmitted data and increment free send buffer space */
+ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
+ &conn->tx_curs_sent);
+ /* increased by confirmed number of bytes */
+ smp_mb__before_atomic();
+ atomic_add(diff, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ smc_curs_write(&conn->tx_curs_fin,
+ smc_curs_read(&conn->tx_curs_sent, conn), conn); /* tx_curs_fin catches up with tx_curs_sent */
+
+ smc_tx_sndbuf_nonfull(smc);
+ return rc; /* rc is 0 here */
+}
+
/********************************* receive ***********************************/
static inline bool smc_cdc_before(u16 seq1, u16 seq2)
@@ -178,7 +232,7 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
if (!sock_flag(&smc->sk, SOCK_URGINLINE))
/* we'll skip the urgent byte, so don't account for it */
(*diff_prod)--;
- base = (char *)conn->rmb_desc->cpu_addr;
+ base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off;
if (conn->urg_curs.count)
conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
else
@@ -276,6 +330,34 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
sock_put(&smc->sk); /* no free sk in softirq-context */
}
+/* Receive tasklet handler for an SMC-D connection (installed by smcd_cdc_rx_init()).
+ * Copies the CDC message from the start of the RMB and passes it to smc_cdc_msg_recv().
+ * NOTE(review): presumably scheduled by the ISM device IRQ handler on a DMBE update - confirm.
+ * Context:
+ * - tasklet context
+ */
+static void smcd_cdc_rx_tsklet(unsigned long data)
+{
+ struct smc_connection *conn = (struct smc_connection *)data;
+ struct smcd_cdc_msg cdc;
+ struct smc_sock *smc;
+
+ if (!conn)
+ return;
+
+ memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc)); /* work on a local copy of the header */
+ smc = container_of(conn, struct smc_sock, conn);
+ smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc); /* passed on as a generic smc_cdc_msg */
+}
+
+/* Initialize the receive tasklet of conn: smcd_cdc_rx_tsklet() is the handler and conn its argument.
+ * NOTE(review): presumably the tasklet is then scheduled from the ISM device IRQ handler - confirm.
+ */
+void smcd_cdc_rx_init(struct smc_connection *conn)
+{
+ tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn);
+}
+
/***************************** init, exit, misc ******************************/
static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index f60082fee5b8..8fbce4fee3e4 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -50,6 +50,20 @@ struct smc_cdc_msg {
u8 reserved[18];
} __packed; /* format defined in RFC7609 */
+/* CDC message for SMC-D; declared __packed, so fields are laid out without padding */
+struct smcd_cdc_msg {
+ struct smc_wr_rx_hdr common; /* Type = 0xFE */
+ u8 res1[7]; /* NOTE(review): presumably reserved - confirm */
+ u16 prod_wrap; /* producer cursor: wrap */
+ u32 prod_count; /* producer cursor: count */
+ u8 res2[2]; /* NOTE(review): presumably reserved - confirm */
+ u16 cons_wrap; /* consumer cursor: wrap */
+ u32 cons_count; /* consumer cursor: count */
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ u8 res3[8]; /* NOTE(review): presumably reserved - confirm */
+} __packed;
+
static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
{
return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
@@ -204,9 +218,9 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
smc_curs_write(local, smc_curs_read(&temp, conn), conn);
}
-static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
- struct smc_cdc_msg *peer,
- struct smc_connection *conn)
+static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
{
local->common.type = peer->common.type;
local->len = peer->len;
@@ -218,6 +232,27 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
local->conn_state_flags = peer->conn_state_flags;
}
+static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smcd_cdc_msg *peer)
+{ /* Copy cursors and flags of an SMC-D CDC message into the host representation by plain assignment */
+ local->prod.wrap = peer->prod_wrap;
+ local->prod.count = peer->prod_count;
+ local->cons.wrap = peer->cons_wrap;
+ local->cons.count = peer->cons_count;
+ local->prod_flags = peer->prod_flags;
+ local->conn_state_flags = peer->conn_state_flags; /* only the six fields assigned here are written */
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{ /* Convert a received CDC message to host form, selecting the SMC-D or SMC-R converter by lgr->is_smcd */
+ if (conn->lgr->is_smcd)
+ smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer); /* on SMC-D, peer is reinterpreted as smcd_cdc_msg */
+ else
+ smcr_cdc_msg_to_host(local, peer, conn);
+}
+
struct smc_cdc_tx_pend;
int smc_cdc_get_free_slot(struct smc_connection *conn,
@@ -227,6 +262,8 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
struct smc_cdc_tx_pend *pend);
int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+int smcd_cdc_msg_send(struct smc_connection *conn);
int smc_cdc_init(void) __init;
+void smcd_cdc_rx_init(struct smc_connection *conn);
#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index ae5d168653ce..ad39efdb4f1c 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -23,9 +23,15 @@
#include "smc_core.h"
#include "smc_clc.h"
#include "smc_ib.h"
+#include "smc_ism.h"
+
+#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
/* eye catcher "SMCR" EBCDIC for CLC messages */
static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+/* eye catcher "SMCD" EBCDIC for CLC messages */
+static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
/* check if received message has a correct header length and contains valid
* heading and trailing eyecatchers
@@ -38,10 +44,14 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
struct smc_clc_msg_decline *dclc;
struct smc_clc_msg_trail *trl;
- if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+ if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
return false;
switch (clcm->type) {
case SMC_CLC_PROPOSAL:
+ if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
+ clcm->path != SMC_TYPE_B)
+ return false;
pclc = (struct smc_clc_msg_proposal *)clcm;
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
if (ntohs(pclc->hdr.length) !=
@@ -56,10 +66,16 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
break;
case SMC_CLC_ACCEPT:
case SMC_CLC_CONFIRM:
+ if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D)
+ return false;
clc = (struct smc_clc_msg_accept_confirm *)clcm;
- if (ntohs(clc->hdr.length) != sizeof(*clc))
+ if ((clcm->path == SMC_TYPE_R &&
+ ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
+ (clcm->path == SMC_TYPE_D &&
+ ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
return false;
- trl = &clc->trl;
+ trl = (struct smc_clc_msg_trail *)
+ ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl));
break;
case SMC_CLC_DECLINE:
dclc = (struct smc_clc_msg_decline *)clcm;
@@ -70,7 +86,8 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
default:
return false;
}
- if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+ if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
return false;
return true;
}
@@ -296,6 +313,9 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
datlen = ntohs(clcm->length);
if ((len < sizeof(struct smc_clc_msg_hdr)) ||
(datlen > buflen) ||
+ (clcm->version != SMC_CLC_V1) ||
+ (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
+ clcm->path != SMC_TYPE_B) ||
((clcm->type != SMC_CLC_DECLINE) &&
(clcm->type != expected_type))) {
smc->sk.sk_err = EPROTO;
@@ -357,17 +377,18 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
}
/* send CLC PROPOSAL message across internal TCP socket */
-int smc_clc_send_proposal(struct smc_sock *smc,
- struct smc_ib_device *smcibdev,
- u8 ibport)
+int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
+ struct smc_ib_device *ibdev, u8 ibport,
+ struct smcd_dev *ismdev)
{
struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
struct smc_clc_msg_proposal_prefix pclc_prfx;
+ struct smc_clc_msg_smcd pclc_smcd;
struct smc_clc_msg_proposal pclc;
struct smc_clc_msg_trail trl;
int len, i, plen, rc;
int reason_code = 0;
- struct kvec vec[4];
+ struct kvec vec[5];
struct msghdr msg;
/* retrieve ip prefixes for CLC proposal msg */
@@ -382,18 +403,34 @@ int smc_clc_send_proposal(struct smc_sock *smc,
memset(&pclc, 0, sizeof(pclc));
memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
pclc.hdr.type = SMC_CLC_PROPOSAL;
- pclc.hdr.length = htons(plen);
pclc.hdr.version = SMC_CLC_V1; /* SMC version */
- memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
- memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
- memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
- pclc.iparea_offset = htons(0);
+ pclc.hdr.path = smc_type;
+ if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) {
+ /* add SMC-R specifics */
+ memcpy(pclc.lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(&pclc.lcl.gid, &ibdev->gid[ibport - 1], SMC_GID_SIZE);
+ memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN);
+ pclc.iparea_offset = htons(0);
+ }
+ if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+ /* add SMC-D specifics */
+ memset(&pclc_smcd, 0, sizeof(pclc_smcd));
+ plen += sizeof(pclc_smcd);
+ pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET);
+ pclc_smcd.gid = ismdev->local_gid;
+ }
+ pclc.hdr.length = htons(plen);
memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
i = 0;
vec[i].iov_base = &pclc;
vec[i++].iov_len = sizeof(pclc);
+ if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+ vec[i].iov_base = &pclc_smcd;
+ vec[i++].iov_len = sizeof(pclc_smcd);
+ }
vec[i].iov_base = &pclc_prfx;
vec[i++].iov_len = sizeof(pclc_prfx);
if (pclc_prfx.ipv6_prefixes_cnt > 0) {
@@ -429,35 +466,56 @@ int smc_clc_send_confirm(struct smc_sock *smc)
struct kvec vec;
int len;
- link = &conn->lgr->lnk[SMC_SINGLE_LINK];
/* send SMC Confirm CLC msg */
memset(&cclc, 0, sizeof(cclc));
- memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
cclc.hdr.type = SMC_CLC_CONFIRM;
- cclc.hdr.length = htons(sizeof(cclc));
cclc.hdr.version = SMC_CLC_V1; /* SMC version */
- memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
- memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
- SMC_GID_SIZE);
- memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
- hton24(cclc.qpn, link->roce_qp->qp_num);
- cclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
- cclc.rmbe_alert_token = htonl(conn->alert_token_local);
- cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
- cclc.rmbe_size = conn->rmbe_size_short;
- cclc.rmb_dma_addr = cpu_to_be64(
- (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
- hton24(cclc.psn, link->psn_initial);
-
- memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ if (smc->conn.lgr->is_smcd) {
+ /* SMC-D specific settings */
+ memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ cclc.hdr.path = SMC_TYPE_D;
+ cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ cclc.gid = conn->lgr->smcd->local_gid;
+ cclc.token = conn->rmb_desc->token;
+ cclc.dmbe_size = conn->rmbe_size_short;
+ cclc.dmbe_idx = 0;
+ memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ } else {
+ /* SMC-R specific settings */
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ cclc.hdr.path = SMC_TYPE_R;
+ cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(cclc.lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ hton24(cclc.qpn, link->roce_qp->qp_num);
+ cclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ cclc.rmbe_size = conn->rmbe_size_short;
+ cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+ (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+ hton24(cclc.psn, link->psn_initial);
+ memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ }
memset(&msg, 0, sizeof(msg));
vec.iov_base = &cclc;
- vec.iov_len = sizeof(cclc);
- len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
- if (len < sizeof(cclc)) {
+ vec.iov_len = ntohs(cclc.hdr.length);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+ ntohs(cclc.hdr.length));
+ if (len < ntohs(cclc.hdr.length)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
@@ -480,35 +538,58 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
int rc = 0;
int len;
- link = &conn->lgr->lnk[SMC_SINGLE_LINK];
memset(&aclc, 0, sizeof(aclc));
- memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
aclc.hdr.type = SMC_CLC_ACCEPT;
- aclc.hdr.length = htons(sizeof(aclc));
aclc.hdr.version = SMC_CLC_V1; /* SMC version */
if (srv_first_contact)
aclc.hdr.flag = 1;
- memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
- memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
- SMC_GID_SIZE);
- memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
- hton24(aclc.qpn, link->roce_qp->qp_num);
- aclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
- aclc.rmbe_alert_token = htonl(conn->alert_token_local);
- aclc.qp_mtu = link->path_mtu;
- aclc.rmbe_size = conn->rmbe_size_short,
- aclc.rmb_dma_addr = cpu_to_be64(
- (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
- hton24(aclc.psn, link->psn_initial);
- memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ if (new_smc->conn.lgr->is_smcd) {
+ /* SMC-D specific settings */
+ aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ aclc.hdr.path = SMC_TYPE_D;
+ aclc.gid = conn->lgr->smcd->local_gid;
+ aclc.token = conn->rmb_desc->token;
+ aclc.dmbe_size = conn->rmbe_size_short;
+ aclc.dmbe_idx = 0;
+ memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ } else {
+ /* SMC-R specific settings */
+ aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ aclc.hdr.path = SMC_TYPE_R;
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memcpy(aclc.lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ hton24(aclc.qpn, link->roce_qp->qp_num);
+ aclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ aclc.qp_mtu = link->path_mtu;
+ aclc.rmbe_size = conn->rmbe_size_short,
+ aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+ (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+ hton24(aclc.psn, link->psn_initial);
+ memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ }
memset(&msg, 0, sizeof(msg));
vec.iov_base = &aclc;
- vec.iov_len = sizeof(aclc);
- len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
- if (len < sizeof(aclc)) {
+ vec.iov_len = ntohs(aclc.hdr.length);
+ len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1,
+ ntohs(aclc.hdr.length));
+ if (len < ntohs(aclc.hdr.length)) {
if (len >= 0)
new_smc->sk.sk_err = EPROTO;
else
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 41ff9ea96139..100e988ad1a8 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -23,6 +23,9 @@
#define SMC_CLC_DECLINE 0x04
#define SMC_CLC_V1 0x1 /* SMC version */
+#define SMC_TYPE_R 0 /* SMC-R only */
+#define SMC_TYPE_D 1 /* SMC-D only */
+#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
@@ -42,9 +45,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */
#if defined(__BIG_ENDIAN_BITFIELD)
u8 version : 4,
flag : 1,
- rsvd : 3;
+ rsvd : 1,
+ path : 2;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 rsvd : 3,
+ u8 path : 2,
+ rsvd : 1,
flag : 1,
version : 4;
#endif
@@ -77,6 +82,11 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
} __aligned(4);
+/* SMC-D specific part of a CLC proposal message; smc_get_clc_msg_smcd()
+ * reads it from the bytes directly following struct smc_clc_msg_proposal.
+ */
+struct smc_clc_msg_smcd { /* SMC-D GID information */
+ u64 gid; /* ISM GID of requestor */
+ u8 res[32]; /* NOTE(review): presumably reserved bytes - confirm */
+};
+
struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
struct smc_clc_msg_hdr hdr;
struct smc_clc_msg_local lcl;
@@ -94,23 +104,45 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
struct smc_clc_msg_hdr hdr;
- struct smc_clc_msg_local lcl;
- u8 qpn[3]; /* QP number */
- __be32 rmb_rkey; /* RMB rkey */
- u8 rmbe_idx; /* Index of RMBE in RMB */
- __be32 rmbe_alert_token;/* unique connection id */
+ union {
+ struct { /* SMC-R */
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
+ __be32 rmbe_alert_token;/* unique connection id */
#if defined(__BIG_ENDIAN_BITFIELD)
- u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
- qp_mtu : 4; /* QP mtu */
+ u8 rmbe_size : 4, /* buf size (compressed) */
+ qp_mtu : 4; /* QP mtu */
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 qp_mtu : 4,
- rmbe_size : 4;
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
#endif
- u8 reserved;
- __be64 rmb_dma_addr; /* RMB virtual address */
- u8 reserved2;
- u8 psn[3]; /* initial packet sequence number */
- struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* packet sequence number */
+ struct smc_clc_msg_trail smcr_trl;
+ /* eye catcher "SMCR" EBCDIC */
+ } __packed;
+ struct { /* SMC-D */
+ u64 gid; /* Sender GID */
+ u64 token; /* DMB token */
+ u8 dmbe_idx; /* DMBE index */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 dmbe_size : 4, /* buf size (compressed) */
+ reserved3 : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved3 : 4,
+ dmbe_size : 4;
+#endif
+ u16 reserved4;
+ u32 linkid; /* Link identifier */
+ u32 reserved5[3];
+ struct smc_clc_msg_trail smcd_trl;
+ /* eye catcher "SMCD" EBCDIC */
+ } __packed;
+ };
} __packed; /* format defined in RFC7609 */
struct smc_clc_msg_decline { /* clc decline message */
@@ -129,13 +161,26 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
}
+/* get SMC-D info from proposal message
+ * @prop: CLC proposal message
+ *
+ * Returns a pointer to the bytes directly following @prop, interpreted as
+ * struct smc_clc_msg_smcd, or NULL if the proposal's iparea_offset (converted
+ * from network byte order) differs from sizeof(struct smc_clc_msg_smcd).
+ * NOTE(review): the size of the underlying receive buffer is not checked
+ * here; presumably the caller guarantees it - confirm.
+ */
+static inline struct smc_clc_msg_smcd *
+smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
+{
+ if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
+ return NULL;
+
+ return (struct smc_clc_msg_smcd *)(prop + 1);
+}
+
+struct smcd_dev;
+
int smc_clc_prfx_match(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop);
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type);
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
-int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
- u8 ibport);
+int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ struct smcd_dev *ismdev);
int smc_clc_send_confirm(struct smc_sock *smc);
int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index add82b0266f3..66741e61a3b0 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -25,6 +25,7 @@
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
+#include "smc_ism.h"
#define SMC_LGR_NUM_INCR 256
#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
@@ -46,8 +47,8 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
* otherwise there is a risk of out-of-sync link groups.
*/
mod_delayed_work(system_wq, &lgr->free_work,
- lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
- SMC_LGR_FREE_DELAY_SERV);
+ (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
+ SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}
/* Register connection's alert token in our lookup structure.
@@ -153,16 +154,18 @@ static void smc_lgr_free_work(struct work_struct *work)
free:
spin_unlock_bh(&smc_lgr_list.lock);
if (!delayed_work_pending(&lgr->free_work)) {
- if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
+ if (!lgr->is_smcd &&
+ lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
smc_lgr_free(lgr);
}
}
/* create a new SMC link group */
-static int smc_lgr_create(struct smc_sock *smc,
+static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
struct smc_ib_device *smcibdev, u8 ibport,
- char *peer_systemid, unsigned short vlan_id)
+ char *peer_systemid, unsigned short vlan_id,
+ struct smcd_dev *smcismdev, u64 peer_gid)
{
struct smc_link_group *lgr;
struct smc_link *lnk;
@@ -170,17 +173,23 @@ static int smc_lgr_create(struct smc_sock *smc,
int rc = 0;
int i;
+ if (is_smcd && vlan_id) {
+ rc = smc_ism_get_vlan(smcismdev, vlan_id);
+ if (rc)
+ goto out;
+ }
+
lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
if (!lgr) {
rc = -ENOMEM;
goto out;
}
- lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ lgr->is_smcd = is_smcd;
lgr->sync_err = 0;
- memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
lgr->vlan_id = vlan_id;
rwlock_init(&lgr->sndbufs_lock);
rwlock_init(&lgr->rmbs_lock);
+ rwlock_init(&lgr->conns_lock);
for (i = 0; i < SMC_RMBE_SIZES; i++) {
INIT_LIST_HEAD(&lgr->sndbufs[i]);
INIT_LIST_HEAD(&lgr->rmbs[i]);
@@ -189,36 +198,44 @@ static int smc_lgr_create(struct smc_sock *smc,
memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
lgr->conns_all = RB_ROOT;
-
- lnk = &lgr->lnk[SMC_SINGLE_LINK];
- /* initialize link */
- lnk->state = SMC_LNK_ACTIVATING;
- lnk->link_id = SMC_SINGLE_LINK;
- lnk->smcibdev = smcibdev;
- lnk->ibport = ibport;
- lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
- if (!smcibdev->initialized)
- smc_ib_setup_per_ibdev(smcibdev);
- get_random_bytes(rndvec, sizeof(rndvec));
- lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
- rc = smc_llc_link_init(lnk);
- if (rc)
- goto free_lgr;
- rc = smc_wr_alloc_link_mem(lnk);
- if (rc)
- goto clear_llc_lnk;
- rc = smc_ib_create_protection_domain(lnk);
- if (rc)
- goto free_link_mem;
- rc = smc_ib_create_queue_pair(lnk);
- if (rc)
- goto dealloc_pd;
- rc = smc_wr_create_link(lnk);
- if (rc)
- goto destroy_qp;
-
+ if (is_smcd) {
+ /* SMC-D specific settings */
+ lgr->peer_gid = peer_gid;
+ lgr->smcd = smcismdev;
+ } else {
+ /* SMC-R specific settings */
+ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ /* initialize link */
+ lnk->state = SMC_LNK_ACTIVATING;
+ lnk->link_id = SMC_SINGLE_LINK;
+ lnk->smcibdev = smcibdev;
+ lnk->ibport = ibport;
+ lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+ if (!smcibdev->initialized)
+ smc_ib_setup_per_ibdev(smcibdev);
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
+ (rndvec[2] << 16);
+ rc = smc_llc_link_init(lnk);
+ if (rc)
+ goto free_lgr;
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto clear_llc_lnk;
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_wr_create_link(lnk);
+ if (rc)
+ goto destroy_qp;
+ }
smc->conn.lgr = lgr;
- rwlock_init(&lgr->conns_lock);
spin_lock_bh(&smc_lgr_list.lock);
list_add(&lgr->list, &smc_lgr_list.list);
spin_unlock_bh(&smc_lgr_list.lock);
@@ -264,7 +281,12 @@ void smc_conn_free(struct smc_connection *conn)
{
if (!conn->lgr)
return;
- smc_cdc_tx_dismiss_slots(conn);
+ if (conn->lgr->is_smcd) {
+ smc_ism_unset_conn(conn);
+ tasklet_kill(&conn->rx_tsklet);
+ } else {
+ smc_cdc_tx_dismiss_slots(conn);
+ }
smc_lgr_unregister_conn(conn);
smc_buf_unuse(conn);
}
@@ -280,8 +302,8 @@ static void smc_link_clear(struct smc_link *lnk)
smc_wr_free_link_mem(lnk);
}
-static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
- struct smc_buf_desc *buf_desc)
+static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc)
{
struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
@@ -301,6 +323,28 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
kfree(buf_desc);
}
+/* Free an SMC-D buffer descriptor.
+ * @lgr: link group; its smcd device is used to unregister a DMB
+ * @is_dmb: true: @buf_desc describes a registered DMB,
+ * false: its cpu_addr was allocated with kzalloc()
+ * @buf_desc: descriptor to release; it is kfree()d in both cases
+ */
+static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
+ struct smc_buf_desc *buf_desc)
+{
+ if (is_dmb) {
+ /* restore original buf len */
+ /* smcd_new_buf_create() stored len reduced by the CDC header size */
+ buf_desc->len += sizeof(struct smcd_cdc_msg);
+ smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+ } else {
+ kfree(buf_desc->cpu_addr);
+ }
+ kfree(buf_desc);
+}
+
+/* Free a buffer descriptor of a link group, dispatching on lgr->is_smcd to
+ * the SMC-D or the SMC-R specific implementation.
+ * @is_rmb: passed through unchanged; smcd_buf_free() takes it as is_dmb
+ */
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc)
+{
+ if (lgr->is_smcd)
+ smcd_buf_free(lgr, is_rmb, buf_desc);
+ else
+ smcr_buf_free(lgr, is_rmb, buf_desc);
+}
+
static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
struct smc_buf_desc *buf_desc, *bf_desc;
@@ -332,7 +376,10 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
void smc_lgr_free(struct smc_link_group *lgr)
{
smc_lgr_free_bufs(lgr);
- smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+ if (lgr->is_smcd)
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ else
+ smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
kfree(lgr);
}
@@ -357,7 +404,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
lgr->terminating = 1;
if (!list_empty(&lgr->list)) /* forget lgr */
list_del_init(&lgr->list);
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ if (!lgr->is_smcd)
+ smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
write_lock_bh(&lgr->conns_lock);
node = rb_first(&lgr->conns_all);
@@ -374,7 +422,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
node = rb_first(&lgr->conns_all);
}
write_unlock_bh(&lgr->conns_lock);
- wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
+ if (!lgr->is_smcd)
+ wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
smc_lgr_schedule_free_work(lgr);
}
@@ -392,17 +441,44 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
spin_lock_bh(&smc_lgr_list.lock);
list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
- if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
+ if (!lgr->is_smcd &&
+ lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
__smc_lgr_terminate(lgr);
}
spin_unlock_bh(&smc_lgr_list.lock);
}
+/* Called when SMC-D device is terminated or peer is lost
+ * @dev: SMC-D device whose link groups are to be terminated
+ * @peer_gid: only link groups with this peer GID are terminated;
+ * 0 matches every peer of @dev
+ *
+ * Matching link groups are terminated under smc_lgr_list.lock and collected
+ * on a local list; they are freed after the lock has been dropped.
+ */
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid)
+{
+ struct smc_link_group *lgr, *l;
+ LIST_HEAD(lgr_free_list);
+
+ /* run common cleanup function and build free list */
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
+ if (lgr->is_smcd && lgr->smcd == dev &&
+ (!peer_gid || lgr->peer_gid == peer_gid) &&
+ !list_empty(&lgr->list)) {
+ __smc_lgr_terminate(lgr);
+ /* keep the lgr reachable for the free loop below */
+ list_move(&lgr->list, &lgr_free_list);
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ /* cancel the regular free workers and actually free lgrs */
+ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ cancel_delayed_work_sync(&lgr->free_work);
+ smc_lgr_free(lgr);
+ }
+}
+
/* Determine vlan of internal TCP socket.
* @vlan_id: address to store the determined vlan id into
*/
-static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
struct dst_entry *dst = sk_dst_get(clcsock->sk);
struct net_device *ndev;
@@ -477,10 +553,30 @@ static int smc_link_determine_gid(struct smc_link_group *lgr)
return -ENODEV;
}
+/* Check whether an existing link group can be reused for an SMC-R peer.
+ * @lgr: link group to examine
+ * @lcl: local part of the peer's CLC message (system id, GID, MAC)
+ * @role: role (client or server) the link group must have
+ *
+ * Returns true if @lgr is an SMC-R link group whose peer system id, peer GID
+ * and peer MAC of the single link equal those in @lcl and whose role is @role.
+ */
+static bool smcr_lgr_match(struct smc_link_group *lgr,
+			   struct smc_clc_msg_local *lcl,
+			   enum smc_lgr_role role)
+{
+	/* peer_systemid, lnk[] and role share a union with the SMC-D fields
+	 * of struct smc_link_group; they are only meaningful for SMC-R
+	 */
+	if (lgr->is_smcd)
+		return false;
+
+	return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
+		       SMC_SYSTEMID_LEN) &&
+	       !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+		       SMC_GID_SIZE) &&
+	       !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+		       sizeof(lcl->mac)) &&
+	       lgr->role == role;
+}
+
+/* Check whether an existing link group can be reused for an SMC-D peer.
+ * @lgr: link group to examine
+ * @smcismdev: SMC-D device the connection uses
+ * @peer_gid: GID of the peer
+ *
+ * Returns true if @lgr is an SMC-D link group for @peer_gid on @smcismdev.
+ */
+static bool smcd_lgr_match(struct smc_link_group *lgr,
+			   struct smcd_dev *smcismdev, u64 peer_gid)
+{
+	/* peer_gid and smcd share a union with the SMC-R fields of
+	 * struct smc_link_group; they are only meaningful for SMC-D
+	 */
+	return lgr->is_smcd &&
+	       lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
+}
+
/* create a new SMC connection (and a new link group if necessary) */
-int smc_conn_create(struct smc_sock *smc,
+int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
struct smc_ib_device *smcibdev, u8 ibport,
- struct smc_clc_msg_local *lcl, int srv_first_contact)
+ struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
+ u64 peer_gid)
{
struct smc_connection *conn = &smc->conn;
int local_contact = SMC_FIRST_CONTACT;
@@ -502,17 +598,12 @@ int smc_conn_create(struct smc_sock *smc,
spin_lock_bh(&smc_lgr_list.lock);
list_for_each_entry(lgr, &smc_lgr_list.list, list) {
write_lock_bh(&lgr->conns_lock);
- if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
- SMC_SYSTEMID_LEN) &&
- !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
- SMC_GID_SIZE) &&
- !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
- sizeof(lcl->mac)) &&
+ if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
+ smcr_lgr_match(lgr, lcl, role)) &&
!lgr->sync_err &&
- (lgr->role == role) &&
- (lgr->vlan_id == vlan_id) &&
- ((role == SMC_CLNT) ||
- (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
+ lgr->vlan_id == vlan_id &&
+ (role == SMC_CLNT ||
+ lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
/* link group found */
local_contact = SMC_REUSE_CONTACT;
conn->lgr = lgr;
@@ -535,16 +626,21 @@ int smc_conn_create(struct smc_sock *smc,
create:
if (local_contact == SMC_FIRST_CONTACT) {
- rc = smc_lgr_create(smc, smcibdev, ibport,
- lcl->id_for_peer, vlan_id);
+ rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
+ lcl->id_for_peer, vlan_id, smcd, peer_gid);
if (rc)
goto out;
smc_lgr_register_conn(conn); /* add smc conn to lgr */
- rc = smc_link_determine_gid(conn->lgr);
+ if (!is_smcd)
+ rc = smc_link_determine_gid(conn->lgr);
}
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
conn->urg_state = SMC_URG_READ;
+ if (is_smcd) {
+ conn->rx_off = sizeof(struct smcd_cdc_msg);
+ smcd_cdc_rx_init(conn); /* init tasklet for this conn */
+ }
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&conn->acurs_lock);
#endif
@@ -609,8 +705,8 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size)
return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
-static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
- bool is_rmb, int bufsize)
+static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
+ bool is_rmb, int bufsize)
{
struct smc_buf_desc *buf_desc;
struct smc_link *lnk;
@@ -668,7 +764,44 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
return buf_desc;
}
-static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
+#define SMCD_DMBE_SIZES		7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+
+/* Create a new SMC-D buffer descriptor.
+ * @lgr: link group, handed to smc_ism_register_dmb() when a DMB is created
+ * @is_dmb: true: register a DMB via smc_ism_register_dmb(),
+ *	    false: allocate a plain zeroed buffer with kzalloc();
+ *	    __smc_buf_create() passes its is_rmb flag here
+ * @bufsize: requested buffer size in bytes
+ *
+ * Returns the new descriptor on success, ERR_PTR(-ENOMEM) if the descriptor
+ * itself cannot be allocated, and ERR_PTR(-EAGAIN) if the compressed size is
+ * outside the SMC-D range or the buffer cannot be registered / allocated.
+ */
+static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
+						bool is_dmb, int bufsize)
+{
+	struct smc_buf_desc *buf_desc;
+	int rc;
+
+	/* valid compressed sizes are 0 .. SMCD_DMBE_SIZES - 1 */
+	if (smc_compress_bufsize(bufsize) >= SMCD_DMBE_SIZES)
+		return ERR_PTR(-EAGAIN);
+
+	/* try to alloc a new DMB */
+	buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+	if (!buf_desc)
+		return ERR_PTR(-ENOMEM);
+	if (is_dmb) {
+		rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
+		if (rc) {
+			/* the specific error is not propagated */
+			kfree(buf_desc);
+			return ERR_PTR(-EAGAIN);
+		}
+		buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
+		/* CDC header stored in buf. So, pretend it was smaller */
+		buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
+	} else {
+		buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
+					     __GFP_NOWARN | __GFP_NORETRY |
+					     __GFP_NOMEMALLOC);
+		if (!buf_desc->cpu_addr) {
+			kfree(buf_desc);
+			return ERR_PTR(-EAGAIN);
+		}
+		buf_desc->len = bufsize;
+	}
+	return buf_desc;
+}
+
+static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
struct smc_connection *conn = &smc->conn;
@@ -706,7 +839,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
break; /* found reusable slot */
}
- buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
+ if (is_smcd)
+ buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
+ else
+ buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
+
if (PTR_ERR(buf_desc) == -ENOMEM)
break;
if (IS_ERR(buf_desc))
@@ -727,7 +864,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
conn->rmbe_size_short = bufsize_short;
smc->sk.sk_rcvbuf = bufsize * 2;
atomic_set(&conn->bytes_to_rcv, 0);
- conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
+ conn->rmbe_update_limit =
+ smc_rmb_wnd_update_limit(buf_desc->len);
+ if (is_smcd)
+ smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
} else {
conn->sndbuf_desc = buf_desc;
smc->sk.sk_sndbuf = bufsize * 2;
@@ -740,6 +880,8 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->sndbuf_desc, DMA_TO_DEVICE);
}
@@ -748,6 +890,8 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->sndbuf_desc, DMA_TO_DEVICE);
}
@@ -756,6 +900,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->rmb_desc, DMA_FROM_DEVICE);
}
@@ -764,6 +910,8 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->rmb_desc, DMA_FROM_DEVICE);
}
@@ -774,16 +922,16 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
* the Linux implementation uses just one RMB-element per RMB, i.e. uses an
* extra RMB for every connection in a link group
*/
-int smc_buf_create(struct smc_sock *smc)
+int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
int rc;
/* create send buffer */
- rc = __smc_buf_create(smc, false);
+ rc = __smc_buf_create(smc, is_smcd, false);
if (rc)
return rc;
/* create rmb */
- rc = __smc_buf_create(smc, true);
+ rc = __smc_buf_create(smc, is_smcd, true);
if (rc)
smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
return rc;
@@ -865,7 +1013,8 @@ void smc_core_exit(void)
spin_unlock_bh(&smc_lgr_list.lock);
list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
list_del_init(&lgr->list);
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ if (!lgr->is_smcd)
+ smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
cancel_delayed_work_sync(&lgr->free_work);
smc_lgr_free(lgr); /* free link group */
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 93cb3523bf50..8b47e0168fc3 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -124,15 +124,28 @@ struct smc_buf_desc {
void *cpu_addr; /* virtual address of buffer */
struct page *pages;
int len; /* length of buffer */
- struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
- struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
- /* for rmb only: memory region
- * incl. rkey provided to peer
- */
- u32 order; /* allocation order */
u32 used; /* currently used / unused */
u8 reused : 1; /* new created / reused */
u8 regerr : 1; /* err during registration */
+ union {
+ struct { /* SMC-R */
+ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
+ /* virtual buffer */
+ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
+ /* for rmb only: memory region
+ * incl. rkey provided to peer
+ */
+ u32 order; /* allocation order */
+ };
+ struct { /* SMC-D */
+ unsigned short sba_idx;
+ /* SBA index number */
+ u64 token;
+ /* DMB token number */
+ dma_addr_t dma_addr;
+ /* DMA address */
+ };
+ };
};
struct smc_rtoken { /* address/key of remote RMB */
@@ -148,12 +161,10 @@ struct smc_rtoken { /* address/key of remote RMB */
* struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
*/
+struct smcd_dev;
+
struct smc_link_group {
struct list_head list;
- enum smc_lgr_role role; /* client or server */
- struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
- char peer_systemid[SMC_SYSTEMID_LEN];
- /* unique system_id of peer */
struct rb_root conns_all; /* connection tree */
rwlock_t conns_lock; /* protects conns_all */
unsigned int conns_num; /* current # of connections */
@@ -163,17 +174,35 @@ struct smc_link_group {
rwlock_t sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
rwlock_t rmbs_lock; /* protects rx buffers */
- struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
- [SMC_LINKS_PER_LGR_MAX];
- /* remote addr/key pairs */
- unsigned long rtokens_used_mask[BITS_TO_LONGS(
- SMC_RMBS_PER_LGR_MAX)];
- /* used rtoken elements */
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
u8 sync_err : 1; /* lgr no longer fits to peer */
u8 terminating : 1;/* lgr is terminating */
+
+ bool is_smcd; /* SMC-R or SMC-D */
+ union {
+ struct { /* SMC-R */
+ enum smc_lgr_role role;
+ /* client or server */
+ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX];
+ /* smc link */
+ char peer_systemid[SMC_SYSTEMID_LEN];
+ /* unique system_id of peer */
+ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
+ [SMC_LINKS_PER_LGR_MAX];
+ /* remote addr/key pairs */
+ unsigned long rtokens_used_mask[BITS_TO_LONGS
+ (SMC_RMBS_PER_LGR_MAX)];
+ /* used rtoken elements */
+ };
+ struct { /* SMC-D */
+ u64 peer_gid;
+ /* Peer GID (remote) */
+ struct smcd_dev *smcd;
+ /* ISM device for VLAN reg. */
+ };
+ };
};
/* Find the connection associated with the given alert token in the link group.
@@ -217,7 +246,8 @@ void smc_lgr_free(struct smc_link_group *lgr);
void smc_lgr_forget(struct smc_link_group *lgr);
void smc_lgr_terminate(struct smc_link_group *lgr);
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
-int smc_buf_create(struct smc_sock *smc);
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid);
+int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn,
struct smc_clc_msg_accept_confirm *clc);
@@ -227,9 +257,13 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
+int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id);
+
void smc_conn_free(struct smc_connection *conn);
-int smc_conn_create(struct smc_sock *smc,
+int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
struct smc_ib_device *smcibdev, u8 ibport,
- struct smc_clc_msg_local *lcl, int srv_first_contact);
+ struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
+ u64 peer_gid);
+void smcd_conn_free(struct smc_connection *conn);
void smc_core_exit(void);
#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 839354402215..6d83eef1b743 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -136,7 +136,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
goto errout;
}
- if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr &&
+ if (smc->conn.lgr && !smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
!list_empty(&smc->conn.lgr->list)) {
struct smc_diag_lgrinfo linfo = {
.role = smc->conn.lgr->role,
@@ -155,6 +156,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
goto errout;
}
+ if (smc->conn.lgr && smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+ !list_empty(&smc->conn.lgr->list)) {
+ struct smc_connection *conn = &smc->conn;
+ struct smcd_diag_dmbinfo dinfo = {
+ .linkid = *((u32 *)conn->lgr->id),
+ .peer_gid = conn->lgr->peer_gid,
+ .my_gid = conn->lgr->smcd->local_gid,
+ .token = conn->rmb_desc->token,
+ .peer_token = conn->peer_token
+ };
+
+ if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
+ goto errout;
+ }
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 0eed7ab9f28b..36de2fd76170 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -143,6 +143,62 @@ out:
return rc;
}
+/* Store GID and MAC address of an IB port in the SMC IB device.
+ * @smcibdev: SMC IB device to update
+ * @ibport: port number; entry ibport - 1 of the gid[] and mac[] arrays is
+ * filled
+ *
+ * Queries GID index 0 of the port and copies the MAC address of the net
+ * device reported with it. Returns 0 on success, -ENODEV if the query fails
+ * or reports no net device.
+ */
+static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct ib_gid_attr gattr;
+ int rc;
+
+ rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
+ &smcibdev->gid[ibport - 1], &gattr);
+ if (rc || !gattr.ndev)
+ return -ENODEV;
+
+ memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
+ /* presumably drops the reference ib_query_gid() took on the ndev */
+ dev_put(gattr.ndev);
+ return 0;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ * @smcibdev: IB device providing the MAC address
+ * @ibport: port number; mac[ibport - 1] is used
+ *
+ * Layout of local_systemid: bytes 0-1 random, MAC address from byte 2 on.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+ sizeof(smcibdev->mac[ibport - 1]));
+ get_random_bytes(&local_systemid[0], 2);
+}
+
+/* Returns true if the remembered port attributes of port @ibport (stored at
+ * pattr[ibport - 1]) report the state IB_PORT_ACTIVE.
+ */
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+/* Refresh the cached attributes of an IB port.
+ * @smcibdev: SMC IB device to update
+ * @ibport: port number; entry ibport - 1 of the per-port arrays is used
+ *
+ * Clears and re-queries pattr[ibport - 1], then refreshes GID and MAC of the
+ * port. If local_systemid still equals SMC_LOCAL_SYSTEMID_RESET and the port
+ * is active, the local system identifier is created from this port.
+ * Returns 0 on success, otherwise the error of the failing step.
+ */
+static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ int rc;
+
+ memset(&smcibdev->pattr[ibport - 1], 0,
+ sizeof(smcibdev->pattr[ibport - 1]));
+ rc = ib_query_port(smcibdev->ibdev, ibport,
+ &smcibdev->pattr[ibport - 1]);
+ if (rc)
+ goto out;
+ /* the SMC protocol requires specification of the RoCE MAC address */
+ rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
+ if (rc)
+ goto out;
+ if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+ sizeof(local_systemid)) &&
+ smc_ib_port_active(smcibdev, ibport))
+ /* create unique system identifier */
+ smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+ return rc;
+}
+
/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
@@ -370,62 +426,6 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0;
}
-static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
-{
- struct ib_gid_attr gattr;
- int rc;
-
- rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
- &smcibdev->gid[ibport - 1], &gattr);
- if (rc || !gattr.ndev)
- return -ENODEV;
-
- memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
- dev_put(gattr.ndev);
- return 0;
-}
-
-/* Create an identifier unique for this instance of SMC-R.
- * The MAC-address of the first active registered IB device
- * plus a random 2-byte number is used to create this identifier.
- * This name is delivered to the peer during connection initialization.
- */
-static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
- u8 ibport)
-{
- memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
- sizeof(smcibdev->mac[ibport - 1]));
- get_random_bytes(&local_systemid[0], 2);
-}
-
-bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
-{
- return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
-}
-
-int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
-{
- int rc;
-
- memset(&smcibdev->pattr[ibport - 1], 0,
- sizeof(smcibdev->pattr[ibport - 1]));
- rc = ib_query_port(smcibdev->ibdev, ibport,
- &smcibdev->pattr[ibport - 1]);
- if (rc)
- goto out;
- /* the SMC protocol requires specification of the RoCE MAC address */
- rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
- if (rc)
- goto out;
- if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
- sizeof(local_systemid)) &&
- smc_ib_port_active(smcibdev, ibport))
- /* create unique system identifier */
- smc_ib_define_local_systemid(smcibdev, ibport);
-out:
- return rc;
-}
-
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
struct ib_cq_init_attr cqattr = {
@@ -454,9 +454,6 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
smcibdev->roce_cq_recv = NULL;
goto err;
}
- INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
- smc_ib_global_event_handler);
- ib_register_event_handler(&smcibdev->event_handler);
smc_wr_add_dev(smcibdev);
smcibdev->initialized = 1;
return rc;
@@ -472,7 +469,6 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
return;
smcibdev->initialized = 0;
smc_wr_remove_dev(smcibdev);
- ib_unregister_event_handler(&smcibdev->event_handler);
ib_destroy_cq(smcibdev->roce_cq_recv);
ib_destroy_cq(smcibdev->roce_cq_send);
}
@@ -483,6 +479,8 @@ static struct ib_client smc_ib_client;
static void smc_ib_add_dev(struct ib_device *ibdev)
{
struct smc_ib_device *smcibdev;
+ u8 port_cnt;
+ int i;
if (ibdev->node_type != RDMA_NODE_IB_CA)
return;
@@ -498,6 +496,21 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
list_add_tail(&smcibdev->list, &smc_ib_devices.list);
spin_unlock(&smc_ib_devices.lock);
ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+ INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+ smc_ib_global_event_handler);
+ ib_register_event_handler(&smcibdev->event_handler);
+
+ /* trigger reading of the port attributes */
+ port_cnt = smcibdev->ibdev->phys_port_cnt;
+ for (i = 0;
+ i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
+ i++) {
+ set_bit(i, &smcibdev->port_event_mask);
+ /* determine pnetids of the port */
+ smc_pnetid_by_dev_port(ibdev->dev.parent, i,
+ smcibdev->pnetid[i]);
+ }
+ schedule_work(&smcibdev->port_event_work);
}
/* callback function for ib_register_client() */
@@ -512,6 +525,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
spin_unlock(&smc_ib_devices.lock);
smc_pnet_remove_by_ibdev(smcibdev);
smc_ib_cleanup_per_ibdev(smcibdev);
+ ib_unregister_event_handler(&smcibdev->event_handler);
kfree(smcibdev);
}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index e90630dadf8e..7c1223c91229 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -15,6 +15,7 @@
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <rdma/ib_verbs.h>
+#include <net/smc.h>
#define SMC_MAX_PORTS 2 /* Max # of ports */
#define SMC_GID_SIZE sizeof(union ib_gid)
@@ -40,6 +41,8 @@ struct smc_ib_device { /* ib-device infos for smc */
char mac[SMC_MAX_PORTS][ETH_ALEN];
/* mac address per port*/
union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
+ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
+ /* pnetid per port */
u8 initialized : 1; /* ib dev CQ, evthdl done */
struct work_struct port_event_work;
unsigned long port_event_mask;
@@ -51,7 +54,6 @@ struct smc_link;
int smc_ib_register_client(void) __init;
void smc_ib_unregister_client(void);
bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
-int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
new file mode 100644
index 000000000000..cfade7fdcc6d
--- /dev/null
+++ b/net/smc/smc_ism.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * Functions for ISM device.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <asm/page.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_ism.h"
+#include "smc_pnet.h"
+/* Global registry of SMC-D devices, statically initialized to an empty list
+ * with an unlocked spinlock. smcd_register_dev() and smcd_unregister_dev()
+ * add and remove entries while holding .lock.
+ */
+struct smcd_dev_list smcd_dev_list = {
+ .list = LIST_HEAD_INIT(smcd_dev_list.list),
+ .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock)
+};
+
+/* Test if an ISM communication with the peer @peer_gid is possible.
+ * The query is forwarded to the device's query_remote_gid operation; its
+ * third argument is 1 for a non-zero @vlan_id and 0 otherwise. The
+ * operation's result is returned unchanged.
+ */
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
+{
+	int vlan_flag = vlan_id ? 1 : 0;
+
+	return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_flag, vlan_id);
+}
+
+/* Move @len bytes from @data to the position described by @pos, using the
+ * device's move_data operation. A negative result is passed on to the
+ * caller; any other result is reported as 0.
+ */
+int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos,
+		  void *data, size_t len)
+{
+	int ret = smcd->ops->move_data(smcd, pos->token, pos->index,
+				       pos->signal, pos->offset, data, len);
+
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
+/* Set a connection using this DMBE: store @conn in the device's connection
+ * table, in the slot given by the sba_idx of its RMB descriptor. The table
+ * is updated under the device lock with interrupts disabled.
+ */
+void smc_ism_set_conn(struct smc_connection *conn)
+{
+	struct smcd_dev *smcd = conn->lgr->smcd;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	smcd->conn[conn->rmb_desc->sba_idx] = conn;
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+}
+
+/* Unset a connection using this DMBE: clear the slot of the device's
+ * connection table that belongs to @conn's RMB descriptor. Does nothing
+ * when the connection has no RMB descriptor.
+ */
+void smc_ism_unset_conn(struct smc_connection *conn)
+{
+	struct smcd_dev *smcd;
+	unsigned long irq_flags;
+
+	if (!conn->rmb_desc)
+		return;
+
+	/* only look at the link group once a descriptor is known to exist */
+	smcd = conn->lgr->smcd;
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	smcd->conn[conn->rmb_desc->sba_idx] = NULL;
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+}
+
+/* Register a VLAN identifier with the ISM device. Use a reference count
+ * and add a VLAN identifier only when the first DMB using this VLAN is
+ * registered.
+ *
+ * Returns 0 on success, -EINVAL for a zero vlanid, -ENOMEM if the list
+ * entry cannot be allocated and -EIO if the device's add_vlan_id
+ * operation reports a failure.
+ */
+int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+ struct smc_ism_vlanid *new_vlan, *vlan;
+ unsigned long flags;
+ int rc = 0;
+
+ if (!vlanid) /* No valid vlan id */
+ return -EINVAL;
+
+ /* create new vlan entry, in case we need it; the GFP_KERNEL allocation
+ * is done here, before smcd->lock is taken, and the entry is freed
+ * again on every path that does not add it to the list
+ */
+ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL);
+ if (!new_vlan)
+ return -ENOMEM;
+ new_vlan->vlanid = vlanid;
+ refcount_set(&new_vlan->refcnt, 1);
+
+ /* if there is an existing entry, increase count and return */
+ spin_lock_irqsave(&smcd->lock, flags);
+ list_for_each_entry(vlan, &smcd->vlan, list) {
+ if (vlan->vlanid == vlanid) {
+ refcount_inc(&vlan->refcnt);
+ kfree(new_vlan); /* pre-allocated entry not needed */
+ goto out;
+ }
+ }
+
+ /* no existing entry found.
+ * add new entry to device; might fail, e.g., if HW limit reached
+ */
+ if (smcd->ops->add_vlan_id(smcd, vlanid)) {
+ kfree(new_vlan);
+ rc = -EIO;
+ goto out;
+ }
+ list_add_tail(&new_vlan->list, &smcd->vlan);
+out:
+ spin_unlock_irqrestore(&smcd->lock, flags);
+ return rc;
+}
+
+/* Unregister a VLAN identifier with the ISM device. Use a reference count
+ * and remove a VLAN identifier only when the last DMB using this VLAN is
+ * unregistered.
+ *
+ * Returns 0 on success (including the case where other references remain),
+ * -EINVAL for a zero vlanid, -ENOENT if the id is not on the device's list
+ * and -EIO if the device's del_vlan_id operation reports a failure; in the
+ * -EIO case the list entry is removed and freed nevertheless.
+ */
+int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+ struct smc_ism_vlanid *vlan;
+ unsigned long flags;
+ bool found = false;
+ int rc = 0;
+
+ if (!vlanid) /* No valid vlan id */
+ return -EINVAL;
+
+ spin_lock_irqsave(&smcd->lock, flags);
+ list_for_each_entry(vlan, &smcd->vlan, list) {
+ if (vlan->vlanid == vlanid) {
+ /* still referenced elsewhere: keep the entry, rc stays 0 */
+ if (!refcount_dec_and_test(&vlan->refcnt))
+ goto out;
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ rc = -ENOENT;
+ goto out; /* VLAN id not in table */
+ }
+
+ /* Found and the last reference just gone */
+ if (smcd->ops->del_vlan_id(smcd, vlanid))
+ rc = -EIO;
+ list_del(&vlan->list);
+ kfree(vlan);
+out:
+ spin_unlock_irqrestore(&smcd->lock, flags);
+ return rc;
+}
+
+/* Hand the DMB described by @dmb_desc back to the ISM device.
+ * A zeroed request is filled from the descriptor and passed to the device's
+ * unregister_dmb operation, whose result is returned.
+ */
+int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
+{
+	struct smcd_dmb req;
+
+	memset(&req, 0, sizeof(req));
+	req.dmb_len = dmb_desc->len;
+	req.dma_addr = dmb_desc->dma_addr;
+	req.cpu_addr = dmb_desc->cpu_addr;
+	req.sba_idx = dmb_desc->sba_idx;
+	req.dmb_tok = dmb_desc->token;
+
+	return smcd->ops->unregister_dmb(smcd, &req);
+}
+
+/* Register a DMB of @dmb_len bytes for link group @lgr with its ISM device.
+ * The request carries the descriptor's sba_idx plus the link group's VLAN id
+ * and peer GID. On success the values reported back by the device are copied
+ * into @dmb_desc; on failure @dmb_desc is left untouched.
+ * Returns the result of the device's register_dmb operation.
+ */
+int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
+			 struct smc_buf_desc *dmb_desc)
+{
+	struct smcd_dmb req;
+	int ret;
+
+	memset(&req, 0, sizeof(req));
+	req.rgid = lgr->peer_gid;
+	req.vlan_id = lgr->vlan_id;
+	req.sba_idx = dmb_desc->sba_idx;
+	req.dmb_len = dmb_len;
+
+	ret = lgr->smcd->ops->register_dmb(lgr->smcd, &req);
+	if (ret)
+		return ret;
+
+	dmb_desc->len = req.dmb_len;
+	dmb_desc->dma_addr = req.dma_addr;
+	dmb_desc->cpu_addr = req.cpu_addr;
+	dmb_desc->token = req.dmb_tok;
+	dmb_desc->sba_idx = req.sba_idx;
+	return 0;
+}
+/* Work item carrying one ISM device event. smcd_handle_event() allocates
+ * and queues it; the smc_ism_event_work() worker handles and frees it.
+ */
+struct smc_ism_event_work {
+ struct work_struct work; /* queued on smcd->event_wq */
+ struct smcd_dev *smcd; /* device the event was reported for */
+ struct smcd_event event; /* copy of the reported event */
+};
+
+/* worker for SMC-D events: handles one queued event and frees its work item.
+ * Only GID events trigger an action; DMB events (and any other type) are
+ * ignored.
+ */
+static void smc_ism_event_work(struct work_struct *work)
+{
+	struct smc_ism_event_work *item;
+
+	item = container_of(work, struct smc_ism_event_work, work);
+
+	/* GID event, token is peer GID */
+	if (item->event.type == ISM_EVENT_GID)
+		smc_smcd_terminate(item->smcd, item->event.tok);
+
+	kfree(item);
+}
+
+/* Release callback of the struct device embedded in struct smcd_dev:
+ * frees the connection table first and then the smcd_dev itself.
+ */
+static void smcd_release(struct device *dev)
+{
+	struct smcd_dev *ismdev;
+
+	ismdev = container_of(dev, struct smcd_dev, dev);
+	kfree(ismdev->conn);
+	kfree(ismdev);
+}
+
+/* Allocate and initialize an SMC-D device.
+ *
+ * @parent:   parent device; also used to look up the PNETID of port 0
+ * @name:     device name, also used as suffix of the event workqueue name
+ * @ops:      device operations
+ * @max_dmbs: number of slots in the connection table
+ *
+ * Returns the new device, or NULL if any allocation fails. The device is
+ * not yet registered; the caller releases it with smcd_free_dev().
+ */
+struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
+				const struct smcd_ops *ops, int max_dmbs)
+{
+	struct smcd_dev *smcd;
+
+	smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
+	if (!smcd)
+		return NULL;
+	smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
+			     GFP_KERNEL);
+	if (!smcd->conn) {
+		kfree(smcd);
+		return NULL;
+	}
+
+	smcd->dev.parent = parent;
+	smcd->dev.release = smcd_release;
+	device_initialize(&smcd->dev);
+	/* From here on the memory is owned by the device reference and is
+	 * freed through put_device() -> smcd_release().
+	 */
+
+	/* never use the caller-supplied name as a format string */
+	if (dev_set_name(&smcd->dev, "%s", name))
+		goto err_put;
+	smcd->ops = ops;
+	smc_pnetid_by_dev_port(parent, 0, smcd->pnetid);
+
+	spin_lock_init(&smcd->lock);
+	INIT_LIST_HEAD(&smcd->vlan);
+	smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s",
+						 WQ_MEM_RECLAIM, name);
+	/* without a workqueue, smcd_handle_event() could not queue events */
+	if (!smcd->event_wq)
+		goto err_put;
+	return smcd;
+
+err_put:
+	put_device(&smcd->dev);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(smcd_alloc_dev);
+
+/* Register an SMC-D device: put it on the global device list and add its
+ * struct device. If device_add() fails, the list entry is removed again so
+ * that no unregistered device stays reachable through smcd_dev_list.
+ * Returns the result of device_add().
+ */
+int smcd_register_dev(struct smcd_dev *smcd)
+{
+	int rc;
+
+	spin_lock(&smcd_dev_list.lock);
+	list_add_tail(&smcd->list, &smcd_dev_list.list);
+	spin_unlock(&smcd_dev_list.lock);
+
+	rc = device_add(&smcd->dev);
+	if (rc) {
+		spin_lock(&smcd_dev_list.lock);
+		list_del(&smcd->list);
+		spin_unlock(&smcd_dev_list.lock);
+	}
+	return rc;
+}
+EXPORT_SYMBOL_GPL(smcd_register_dev);
+/* Unregister an SMC-D device: take it off the global device list, flush and
+ * destroy its event workqueue, call smc_smcd_terminate() with a peer GID of
+ * 0 and finally delete the struct device. The memory itself is not released
+ * here; that happens when the last reference is dropped (smcd_free_dev()).
+ * NOTE(review): a GID of 0 presumably means "all peers" - confirm against
+ * smc_smcd_terminate().
+ */
+void smcd_unregister_dev(struct smcd_dev *smcd)
+{
+ spin_lock(&smcd_dev_list.lock);
+ list_del(&smcd->list);
+ spin_unlock(&smcd_dev_list.lock);
+ flush_workqueue(smcd->event_wq);
+ destroy_workqueue(smcd->event_wq);
+ smc_smcd_terminate(smcd, 0);
+
+ device_del(&smcd->dev);
+}
+EXPORT_SYMBOL_GPL(smcd_unregister_dev);
+/* Drop a reference on the struct device embedded in @smcd. smcd_alloc_dev()
+ * installs smcd_release() as the release callback of that device.
+ */
+void smcd_free_dev(struct smcd_dev *smcd)
+{
+ put_device(&smcd->dev);
+}
+EXPORT_SYMBOL_GPL(smcd_free_dev);
+
+/* SMCD Device event handler. Called from ISM device interrupt handler.
+ * Takes the smcd device pointer and the event, whose fields are
+ * - event->type (0 --> DMB, 1 --> GID),
+ * - event->code (event code),
+ * - event->tok (either DMB token when event type 0, or GID when event type 1)
+ * - event->time (time of day)
+ * - event->info (debug info).
+ *
+ * The event is copied into a freshly allocated work item and queued on the
+ * device's event workqueue. If the atomic allocation fails, the event is
+ * dropped silently.
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver event handler.
+ */
+void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
+{
+	struct smc_ism_event_work *item = kmalloc(sizeof(*item), GFP_ATOMIC);
+
+	if (!item)
+		return;
+
+	/* the worker owns the item from here on and frees it */
+	item->event = *event;
+	item->smcd = smcd;
+	INIT_WORK(&item->work, smc_ism_event_work);
+	queue_work(smcd->event_wq, &item->work);
+}
+EXPORT_SYMBOL_GPL(smcd_handle_event);
+
+/* SMCD Device interrupt handler. Called from ISM device interrupt handler.
+ * Looks up the connection stored for DMB number @dmbno in the device's
+ * connection table and, if there is one, schedules its receive tasklet.
+ * Lookup and scheduling both happen under the device lock.
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver IRQ handler.
+ */
+void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
+{
+	struct smc_connection *target;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&smcd->lock, irq_flags);
+	target = smcd->conn[dmbno];
+	if (target)
+		tasklet_schedule(&target->rx_tsklet);
+	spin_unlock_irqrestore(&smcd->lock, irq_flags);
+}
+EXPORT_SYMBOL_GPL(smcd_handle_irq);
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
new file mode 100644
index 000000000000..aee45b860b79
--- /dev/null
+++ b/net/smc/smc_ism.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * SMC-D ISM device structure definitions.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef SMCD_ISM_H
+#define SMCD_ISM_H
+
+#include <linux/uio.h>
+
+#include "smc.h"
+/* Anchor type for the global list of SMC-D devices; the single instance,
+ * smcd_dev_list, is declared right below.
+ */
+struct smcd_dev_list { /* List of SMCD devices */
+ struct list_head list;
+ spinlock_t lock; /* Protects list of devices */
+};
+
+extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
+/* One entry per VLAN id registered with an ISM device. Entries are linked
+ * into the device's vlan list; smc_ism_get_vlan() and smc_ism_put_vlan()
+ * take and drop references.
+ */
+struct smc_ism_vlanid { /* VLAN id set on ISM device */
+ struct list_head list;
+ unsigned short vlanid; /* Vlan id */
+ refcount_t refcnt; /* Reference count */
+};
+/* Target description for smc_ism_write(), which hands these fields to the
+ * device's move_data operation.
+ */
+struct smc_ism_position { /* ISM device position to write to */
+ u64 token; /* Token of DMB */
+ u32 offset; /* Offset into DMBE */
+ u8 index; /* Index of DMBE */
+ u8 signal; /* Generate interrupt on owner side */
+};
+
+struct smcd_dev;
+
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev);
+void smc_ism_set_conn(struct smc_connection *conn);
+void smc_ism_unset_conn(struct smc_connection *conn);
+int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
+ struct smc_buf_desc *dmb_desc);
+int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
+int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos,
+ void *data, size_t len);
+#endif
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index d7b88b2d1b22..1b6c066d3495 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -22,13 +22,12 @@
#include "smc_pnet.h"
#include "smc_ib.h"
-
-#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
+#include "smc_ism.h"
static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
[SMC_PNETID_NAME] = {
.type = NLA_NUL_STRING,
- .len = SMC_MAX_PNET_ID_LEN - 1
+ .len = SMC_MAX_PNETID_LEN - 1
},
[SMC_PNETID_ETHNAME] = {
.type = NLA_NUL_STRING,
@@ -65,7 +64,7 @@ static struct smc_pnettable {
*/
struct smc_pnetentry {
struct list_head list;
- char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
+ char pnet_name[SMC_MAX_PNETID_LEN + 1];
struct net_device *ndev;
struct smc_ib_device *smcibdev;
u8 ib_port;
@@ -209,7 +208,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
return false;
while (--end >= bf && isspace(*end))
;
- if (end - bf >= SMC_MAX_PNET_ID_LEN)
+ if (end - bf >= SMC_MAX_PNETID_LEN)
return false;
while (bf <= end) {
if (!isalnum(*bf))
@@ -358,9 +357,6 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
kfree(pnetelem);
return rc;
}
- rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
- if (rc)
- smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
return rc;
}
@@ -485,10 +481,10 @@ static int smc_pnet_netdev_event(struct notifier_block *this,
case NETDEV_REBOOT:
case NETDEV_UNREGISTER:
smc_pnet_remove_by_ndev(event_dev);
+ return NOTIFY_OK;
default:
- break;
+ return NOTIFY_DONE;
}
- return NOTIFY_DONE;
}
static struct notifier_block smc_netdev_notifier = {
@@ -515,26 +511,91 @@ void smc_pnet_exit(void)
genl_unregister_family(&smc_pnet_nl_family);
}
-/* PNET table analysis for a given sock:
- * determine ib_device and port belonging to used internal TCP socket
- * ethernet interface.
+/* Determine one base device for stacked net devices.
+ * If the lower device level contains more than one device
+ * (for instance with bonding slaves), just the first device
+ * is used to reach a base device.
+ * The walk over the lower-device lists is done under rtnl_lock() and is
+ * limited by the nest level of @ndev; @ndev itself is returned when it has
+ * no lower devices.
*/
-void smc_pnet_find_roce_resource(struct sock *sk,
- struct smc_ib_device **smcibdev, u8 *ibport)
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
{
- struct dst_entry *dst = sk_dst_get(sk);
- struct smc_pnetentry *pnetelem;
+ int i, nest_lvl;
- *smcibdev = NULL;
- *ibport = 0;
+ rtnl_lock();
+ nest_lvl = dev_get_nest_level(ndev);
+ for (i = 0; i < nest_lvl; i++) {
+ struct list_head *lower = &ndev->adj_list.lower;
+
+ if (list_empty(lower))
+ break;
+ lower = lower->next; /* first entry of the lower list */
+ ndev = netdev_lower_get_next(ndev, &lower);
+ }
+ rtnl_unlock();
+ return ndev;
+}
+
+/* Determine the corresponding IB device port based on the hardware PNETID.
+ * Searching stops at the first matching active IB device port.
+ *
+ * The PNETID of the base device of @ndev is compared with the PNETID of
+ * every port of every registered IB device. On a match, *smcibdev and
+ * *ibport are set; otherwise both are left untouched, also when the PNETID
+ * of the net device cannot be determined.
+ */
+static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
+					 struct smc_ib_device **smcibdev,
+					 u8 *ibport)
+{
+	u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+	struct smc_ib_device *ibdev;
+	int i;
+
+	ndev = pnet_find_base_ndev(ndev);
+	if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
+				   ndev_pnetid))
+		return; /* pnetid could not be determined */
+
+	spin_lock(&smc_ib_devices.lock);
+	list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+		for (i = 1; i <= SMC_MAX_PORTS; i++) {
+			if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid,
+				    SMC_MAX_PNETID_LEN) &&
+			    smc_ib_port_active(ibdev, i)) {
+				*smcibdev = ibdev;
+				*ibport = i;
+				/* leave both loops; a plain break would only
+				 * end the port loop and let a later device
+				 * overwrite this first match
+				 */
+				goto out;
+			}
+		}
+	}
+out:
+	spin_unlock(&smc_ib_devices.lock);
+}
+
+/* Determine the ISM device whose PNETID equals the hardware PNETID of the
+ * base device of @ndev. The first device found on the global SMC-D device
+ * list is stored in *smcismdev; without a match, or if the PNETID of the
+ * net device cannot be determined, *smcismdev is left untouched.
+ */
+static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
+					struct smcd_dev **smcismdev)
+{
+	u8 wanted[SMC_MAX_PNETID_LEN];
+	struct smcd_dev *cur;
+	struct net_device *base;
+
+	base = pnet_find_base_ndev(ndev);
+	if (smc_pnetid_by_dev_port(base->dev.parent, base->dev_port, wanted))
+		return; /* pnetid could not be determined */
+
+	spin_lock(&smcd_dev_list.lock);
+	list_for_each_entry(cur, &smcd_dev_list.list, list) {
+		if (memcmp(cur->pnetid, wanted, SMC_MAX_PNETID_LEN))
+			continue;
+		*smcismdev = cur;
+		break;
+	}
+	spin_unlock(&smcd_dev_list.lock);
+}
+
+/* Lookup of coupled ib_device via SMC pnet table */
+static void smc_pnet_find_roce_by_table(struct net_device *netdev,
+ struct smc_ib_device **smcibdev,
+ u8 *ibport)
+{
+ struct smc_pnetentry *pnetelem;
- if (!dst)
- return;
- if (!dst->dev)
- goto out_rel;
read_lock(&smc_pnettable.lock);
list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
- if (dst->dev == pnetelem->ndev) {
+ if (netdev == pnetelem->ndev) {
if (smc_ib_port_active(pnetelem->smcibdev,
pnetelem->ib_port)) {
*smcibdev = pnetelem->smcibdev;
@@ -544,6 +605,54 @@ void smc_pnet_find_roce_resource(struct sock *sk,
}
}
read_unlock(&smc_pnettable.lock);
+}
+
+/* PNET analysis for a given sock:
+ * determine ib_device and port belonging to the ethernet interface used by
+ * the internal TCP socket. The hardware-defined pnetid is tried first; the
+ * SMC pnet table is only consulted when that lookup finds nothing.
+ * *smcibdev is NULL and *ibport is 0 when no resource is found.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk,
+				 struct smc_ib_device **smcibdev, u8 *ibport)
+{
+	struct dst_entry *route = sk_dst_get(sk);
+
+	*smcibdev = NULL;
+	*ibport = 0;
+	if (!route)
+		return;
+
+	if (route->dev) {
+		/* if possible, lookup via hardware-defined pnetid */
+		smc_pnet_find_roce_by_pnetid(route->dev, smcibdev, ibport);
+		/* otherwise lookup via SMC PNET table */
+		if (!*smcibdev)
+			smc_pnet_find_roce_by_table(route->dev, smcibdev,
+						    ibport);
+	}
+	dst_release(route);
+}
+/* Determine the ISM device belonging to the net device used by the route
+ * of @sk, based on the hardware-defined pnetid only. *smcismdev is NULL
+ * when the socket has no route, the route has no device, or the lookup
+ * finds no device. The route reference taken here is released again.
+ */
+void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ *smcismdev = NULL;
+ if (!dst)
+ goto out;
+ if (!dst->dev)
+ goto out_rel;
+
+ /* if possible, lookup via hardware-defined pnetid */
+ smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev);
+
out_rel:
dst_release(dst);
+out:
+ return;
}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 5a29519db976..1e94fd4df7bc 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -12,12 +12,28 @@
#ifndef _SMC_PNET_H
#define _SMC_PNET_H
+#if IS_ENABLED(CONFIG_HAVE_PNETID)
+#include <asm/pnet.h>
+#endif
+
struct smc_ib_device;
+struct smcd_dev;
+/* Look up the PNETID of port @port of device @dev and store it in @pnetid.
+ * With CONFIG_HAVE_PNETID enabled the call is forwarded to
+ * pnet_id_by_dev_port(); otherwise -ENOENT is returned and @pnetid is not
+ * written. NOTE(review): callers in this patch pass buffers of
+ * SMC_MAX_PNETID_LEN bytes - confirm that this is the size the
+ * architecture code expects.
+ */
+static inline int smc_pnetid_by_dev_port(struct device *dev,
+ unsigned short port, u8 *pnetid)
+{
+#if IS_ENABLED(CONFIG_HAVE_PNETID)
+ return pnet_id_by_dev_port(dev, port, pnetid);
+#else
+ return -ENOENT;
+#endif
+}
int smc_pnet_init(void) __init;
void smc_pnet_exit(void);
int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
void smc_pnet_find_roce_resource(struct sock *sk,
struct smc_ib_device **smcibdev, u8 *ibport);
+void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev);
#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 3d77b383cccd..b329803c8339 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -305,7 +305,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
- rcvbuf_base = conn->rmb_desc->cpu_addr;
+ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr;
do { /* while (read_remaining) */
if (read_done >= target || (pipe && read_done))
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index f82886b7d1d8..142bcb134dd6 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -24,6 +24,7 @@
#include "smc.h"
#include "smc_wr.h"
#include "smc_cdc.h"
+#include "smc_ism.h"
#include "smc_tx.h"
#define SMC_TX_WORK_DELAY HZ
@@ -250,6 +251,24 @@ out_err:
/***************************** sndbuf consumer *******************************/
+/* sndbuf consumer: actual data transfer of one target chunk with ISM write.
+ * Writes @len bytes from @data into the peer's DMBE at @offset (relative to
+ * the connection's tx_off), passing @signal on to the device. If the write
+ * fails, the connection is flagged with peer_conn_abort. Returns the result
+ * of smc_ism_write().
+ */
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+		      u32 offset, int signal)
+{
+	struct smc_ism_position target;
+	int rc;
+
+	memset(&target, 0, sizeof(target));
+	target.signal = signal;
+	target.offset = conn->tx_off + offset;
+	target.index = conn->peer_rmbe_idx;
+	target.token = conn->peer_token;
+
+	rc = smc_ism_write(conn->lgr->smcd, &target, data, len);
+	if (!rc)
+		return 0;
+
+	conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+	return rc;
+}
+
/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
int num_sges, struct ib_sge sges[])
@@ -297,21 +316,104 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
smc_curs_add(conn->sndbuf_desc->len, sent, len);
}
+/* SMC-R helper for smc_tx_rdma_writes().
+ * Transfers @len bytes with at most two RDMA writes (the destination may
+ * wrap in the RMBE ring buffer), each built from at most two scatter-gather
+ * entries (the source may wrap in the send ring).
+ * @src_off/@src_len describe the first source chunk in the send buffer,
+ * @dst_off/@dst_len the first destination chunk.
+ * Returns 0 on success or the error of the first failing
+ * smc_tx_rdma_write().
+ */
+static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len)
+{
+ dma_addr_t dma_addr =
+ sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
+ int sent_count = src_off; /* source offset at entry, not advanced below */
+ int srcchunk, dstchunk;
+ int num_sges;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ num_sges = 0;
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ sges[srcchunk].addr = dma_addr + src_off;
+ sges[srcchunk].length = src_len;
+ sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
+ num_sges++;
+
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ /* one RDMA write per destination chunk, using the collected sges */
+ rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
+ if (rc)
+ return rc;
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
+ sent_count);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
+/* SMC-D helper for smc_tx_rdma_writes().
+ * Transfers @len bytes in up to two destination chunks of up to two source
+ * chunks each; every source chunk is written on its own with
+ * smcd_tx_ism_write(), without signal. Destination offsets are shifted by
+ * sizeof(struct smcd_cdc_msg).
+ * @src_off/@src_len describe the first source chunk in the send buffer,
+ * @dst_off/@dst_len the first destination chunk.
+ * Returns 0 on success or the error of the first failing write.
+ */
+static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len)
+{
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ int srcchunk, dstchunk;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ void *data = conn->sndbuf_desc->cpu_addr + src_off;
+
+ rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
+ sizeof(struct smcd_cdc_msg), 0);
+ if (rc)
+ return rc;
+ dst_off += src_len; /* next source chunk continues behind this one */
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
* usable snd_wnd as max transmit
*/
static int smc_tx_rdma_writes(struct smc_connection *conn)
{
- size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
- size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
+ size_t len, src_len, dst_off, dst_len; /* current chunk values */
union smc_host_cursor sent, prep, prod, cons;
- struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
- struct smc_link_group *lgr = conn->lgr;
struct smc_cdc_producer_flags *pflags;
int to_send, rmbespace;
- struct smc_link *link;
- dma_addr_t dma_addr;
- int num_sges;
int rc;
/* source: sndbuf */
@@ -341,7 +443,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
len = min(to_send, rmbespace);
/* initialize variables for first iteration of subsequent nested loop */
- link = &lgr->lnk[SMC_SINGLE_LINK];
dst_off = prod.count;
if (prod.wrap == cons.wrap) {
/* the filled destination area is unwrapped,
@@ -358,8 +459,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
*/
dst_len = len;
}
- dst_len_sum = dst_len;
- src_off = sent.count;
/* dst_len determines the maximum src_len */
if (sent.count + dst_len <= conn->sndbuf_desc->len) {
/* unwrapped src case: single chunk of entire dst_len */
@@ -368,38 +467,15 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
src_len = conn->sndbuf_desc->len - sent.count;
}
- src_len_sum = src_len;
- dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
- for (dstchunk = 0; dstchunk < 2; dstchunk++) {
- num_sges = 0;
- for (srcchunk = 0; srcchunk < 2; srcchunk++) {
- sges[srcchunk].addr = dma_addr + src_off;
- sges[srcchunk].length = src_len;
- sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
- num_sges++;
- src_off += src_len;
- if (src_off >= conn->sndbuf_desc->len)
- src_off -= conn->sndbuf_desc->len;
- /* modulo in send ring */
- if (src_len_sum == dst_len)
- break; /* either on 1st or 2nd iteration */
- /* prepare next (== 2nd) iteration */
- src_len = dst_len - src_len; /* remainder */
- src_len_sum += src_len;
- }
- rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
- if (rc)
- return rc;
- if (dst_len_sum == len)
- break; /* either on 1st or 2nd iteration */
- /* prepare next (== 2nd) iteration */
- dst_off = 0; /* modulo offset in RMBE ring buffer */
- dst_len = len - dst_len; /* remainder */
- dst_len_sum += dst_len;
- src_len = min_t(int,
- dst_len, conn->sndbuf_desc->len - sent.count);
- src_len_sum = src_len;
- }
+
+ if (conn->lgr->is_smcd)
+ rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len);
+ else
+ rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len);
+ if (rc)
+ return rc;
if (conn->urg_tx_pend && len == to_send)
pflags->urg_data_present = 1;
@@ -420,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
/* Wakeup sndbuf consumers from any context (IRQ or process)
* since there is more data to transmit; usable snd_wnd as max transmit
*/
-int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
{
struct smc_cdc_producer_flags *pflags;
struct smc_cdc_tx_pend *pend;
@@ -467,6 +543,37 @@ out_unlock:
return rc;
}
+/* SMC-D variant of smc_tx_sndbuf_nonempty(): under the connection's send
+ * lock, transfer pending send buffer data (skipped while urgent data is
+ * already flagged as present) and then send a CDC message. After a
+ * successful send, the urgent-data producer flags are cleared again.
+ * Returns 0 or the first error encountered.
+ */
+static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+	struct smc_cdc_producer_flags *prod = &conn->local_tx_ctrl.prod_flags;
+	int rc;
+
+	spin_lock_bh(&conn->send_lock);
+	if (!prod->urg_data_present) {
+		rc = smc_tx_rdma_writes(conn);
+		if (rc)
+			goto unlock;
+	}
+
+	rc = smcd_cdc_msg_send(conn);
+	if (rc)
+		goto unlock;
+
+	if (prod->urg_data_present) {
+		prod->urg_data_pending = 0;
+		prod->urg_data_present = 0;
+	}
+unlock:
+	spin_unlock_bh(&conn->send_lock);
+	return rc;
+}
+
+/* Transmit pending send buffer data, dispatching to the SMC-D or the SMC-R
+ * implementation depending on the type of the connection's link group.
+ */
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+	if (conn->lgr->is_smcd)
+		return smcd_tx_sndbuf_nonempty(conn);
+
+	return smcr_tx_sndbuf_nonempty(conn);
+}
+
/* Wakeup sndbuf consumers from process context
* since there is more data to transmit
*/
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 9d2238909fa0..b22bdc5694c4 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -33,5 +33,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn, bool force);
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+ u32 offset, int signal);
#endif /* SMC_TX_H */
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 625acb27efcc..3a512936eea9 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -140,11 +140,13 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
/* We are going to append to the frags_list of head.
* Need to unshare the frag_list.
*/
- err = skb_unclone(head, GFP_ATOMIC);
- if (err) {
- STRP_STATS_INCR(strp->stats.mem_fail);
- desc->error = err;
- return 0;
+ if (skb_has_frag_list(head)) {
+ err = skb_unclone(head, GFP_ATOMIC);
+ if (err) {
+ STRP_STATS_INCR(strp->stats.mem_fail);
+ desc->error = err;
+ return 0;
+ }
}
if (unlikely(skb_shinfo(head)->frag_list)) {
@@ -201,14 +203,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
memset(stm, 0, sizeof(*stm));
stm->strp.offset = orig_offset + eaten;
} else {
- /* Unclone since we may be appending to an skb that we
+ /* Unclone if we are appending to an skb that we
* already share a frag_list with.
*/
- err = skb_unclone(skb, GFP_ATOMIC);
- if (err) {
- STRP_STATS_INCR(strp->stats.mem_fail);
- desc->error = err;
- break;
+ if (skb_has_frag_list(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err) {
+ STRP_STATS_INCR(strp->stats.mem_fail);
+ desc->error = err;
+ break;
+ }
}
stm = _strp_msg(head);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 2dfb492a7c94..fd6d8f18955c 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -610,6 +610,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
case NETDEV_CHANGE:
if (netif_carrier_ok(dev))
break;
+ /* else: fall through */
case NETDEV_UP:
test_and_set_bit_lock(0, &b->up);
break;
diff --git a/net/tipc/group.c b/net/tipc/group.c
index d7a7befeddd4..8f43e7d6046b 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -159,11 +159,6 @@ u32 tipc_group_exclude(struct tipc_group *grp)
return 0;
}
-int tipc_group_size(struct tipc_group *grp)
-{
- return grp->member_cnt;
-}
-
struct tipc_group *tipc_group_create(struct net *net, u32 portid,
struct tipc_group_req *mreq,
bool *group_is_open)
@@ -918,3 +913,35 @@ void tipc_group_member_evt(struct tipc_group *grp,
}
*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
}
+
+/* Add a nested TIPC_NLA_SOCK_GROUP attribute describing @grp to @skb:
+ * group id (type), instance and next broadcast send number, plus flag
+ * attributes for node or cluster scope and for an open group.
+ *
+ * Returns 0 on success and -1 if the nest could not be started or an
+ * attribute could not be added; a partially written nest is cancelled.
+ */
+int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb)
+{
+	struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP);
+
+	/* nla_nest_start() returns NULL if there is no room left in @skb;
+	 * nothing has been written then, so there is nothing to cancel
+	 */
+	if (!group)
+		return -1;
+
+	if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID,
+			grp->type) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE,
+			grp->instance) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT,
+			grp->bc_snd_nxt))
+		goto group_msg_cancel;
+
+	if (grp->scope == TIPC_NODE_SCOPE)
+		if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE))
+			goto group_msg_cancel;
+
+	if (grp->scope == TIPC_CLUSTER_SCOPE)
+		if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE))
+			goto group_msg_cancel;
+
+	if (*grp->open)
+		if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN))
+			goto group_msg_cancel;
+
+	nla_nest_end(skb, group);
+	return 0;
+
+group_msg_cancel:
+	nla_nest_cancel(skb, group);
+	return -1;
+}
diff --git a/net/tipc/group.h b/net/tipc/group.h
index 5996af6e9f1d..76b4e5a7b39d 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -72,4 +72,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
u32 port, struct sk_buff_head *xmitq);
u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
void tipc_group_update_member(struct tipc_member *m, int len);
+int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb);
#endif
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 695acb783969..6987ffc8e7a1 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -106,7 +106,8 @@ struct tipc_stats {
* @backlogq: queue for messages waiting to be sent
* @snt_nxt: next sequence number to use for outbound messages
* @last_retransmitted: sequence number of most recently retransmitted message
- * @stale_count: # of identical retransmit requests made by peer
+ * @stale_cnt: counter for number of identical retransmit attempts
+ * @stale_limit: time when repeated identical retransmits must force link reset
* @ackers: # of peers that needs to ack each packet before it can be released
* @acked: # last packet acked by a certain peer. Used for broadcast.
* @rcv_nxt: next sequence number to expect for inbound messages
@@ -127,14 +128,17 @@ struct tipc_link {
struct net *net;
/* Management and link supervision data */
- u32 peer_session;
- u32 session;
+ u16 peer_session;
+ u16 session;
+ u16 snd_nxt_state;
+ u16 rcv_nxt_state;
u32 peer_bearer_id;
u32 bearer_id;
u32 tolerance;
u32 abort_limit;
u32 state;
u16 peer_caps;
+ bool in_session;
bool active;
u32 silent_intv_cnt;
char if_name[TIPC_MAX_IF_NAME];
@@ -161,7 +165,8 @@ struct tipc_link {
u16 snd_nxt;
u16 last_retransm;
u16 window;
- u32 stale_count;
+ u16 stale_cnt;
+ unsigned long stale_limit;
/* Reception */
u16 rcv_nxt;
@@ -212,11 +217,6 @@ enum {
*/
#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
-/* Wildcard value for link session numbers. When it is known that
- * peer endpoint is down, any session number must be accepted.
- */
-#define ANY_SESSION 0x10000
-
/* Link FSM states:
*/
enum {
@@ -297,11 +297,6 @@ static bool link_is_bc_rcvlink(struct tipc_link *l)
return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l));
}
-int tipc_link_is_active(struct tipc_link *l)
-{
- return l->active;
-}
-
void tipc_link_set_active(struct tipc_link *l, bool active)
{
l->active = active;
@@ -337,6 +332,11 @@ char tipc_link_plane(struct tipc_link *l)
return l->net_plane;
}
+void tipc_link_update_caps(struct tipc_link *l, u16 capabilities)
+{
+ l->peer_caps = capabilities;
+}
+
void tipc_link_add_bc_peer(struct tipc_link *snd_l,
struct tipc_link *uc_l,
struct sk_buff_head *xmitq)
@@ -469,7 +469,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
l->addr = peer;
l->peer_caps = peer_caps;
l->net = net;
- l->peer_session = ANY_SESSION;
+ l->in_session = false;
l->bearer_id = bearer_id;
l->tolerance = tolerance;
l->net_plane = net_plane;
@@ -838,7 +838,7 @@ void link_prepare_wakeup(struct tipc_link *l)
void tipc_link_reset(struct tipc_link *l)
{
- l->peer_session = ANY_SESSION;
+ l->in_session = false;
l->session++;
l->mtu = l->advertised_mtu;
__skb_queue_purge(&l->transmq);
@@ -857,10 +857,12 @@ void tipc_link_reset(struct tipc_link *l)
l->rcv_unacked = 0;
l->snd_nxt = 1;
l->rcv_nxt = 1;
+ l->snd_nxt_state = 1;
+ l->rcv_nxt_state = 1;
l->acked = 0;
l->silent_intv_cnt = 0;
l->rst_cnt = 0;
- l->stale_count = 0;
+ l->stale_cnt = 0;
l->bc_peer_is_up = false;
memset(&l->mon_state, 0, sizeof(l->mon_state));
tipc_link_reset_stats(l);
@@ -997,39 +999,41 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb)
msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
}
-int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker,
+/* tipc_link_retrans() - retransmit one or more packets
+ * @l: the link to transmit on
+ * @r: the receiving link ordering the retransmit. Same as l if unicast
+ * @from: retransmit from (inclusive) this sequence number
+ * @to: retransmit to (inclusive) this sequence number
+ * xmitq: queue for accumulating the retransmitted packets
+ */
+int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
u16 from, u16 to, struct sk_buff_head *xmitq)
{
struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
- struct tipc_msg *hdr;
- u16 ack = l->rcv_nxt - 1;
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
+ u16 ack = l->rcv_nxt - 1;
+ struct tipc_msg *hdr;
if (!skb)
return 0;
/* Detect repeated retransmit failures on same packet */
- if (nacker->last_retransm != buf_seqno(skb)) {
- nacker->last_retransm = buf_seqno(skb);
- nacker->stale_count = 1;
- } else if (++nacker->stale_count > 100) {
+ if (r->last_retransm != buf_seqno(skb)) {
+ r->last_retransm = buf_seqno(skb);
+ r->stale_limit = jiffies + msecs_to_jiffies(l->tolerance);
+ } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
link_retransmit_failure(l, skb);
- nacker->stale_count = 0;
if (link_is_bc_sndlink(l))
return TIPC_LINK_DOWN_EVT;
return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
}
- /* Move forward to where retransmission should start */
skb_queue_walk(&l->transmq, skb) {
- if (!less(buf_seqno(skb), from))
- break;
- }
-
- skb_queue_walk_from(&l->transmq, skb) {
- if (more(buf_seqno(skb), to))
- break;
hdr = buf_msg(skb);
+ if (less(msg_seqno(hdr), from))
+ continue;
+ if (more(msg_seqno(hdr), to))
+ break;
_skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
if (!_skb)
return 0;
@@ -1063,6 +1067,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
skb_queue_tail(mc_inputq, skb);
return true;
}
+ /* else: fall through */
case CONN_MANAGER:
skb_queue_tail(inputq, skb);
return true;
@@ -1271,6 +1276,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Forward queues and wake up waiting users */
if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
+ l->stale_cnt = 0;
tipc_link_advance_backlog(l, xmitq);
if (unlikely(!skb_queue_empty(&l->wakeupq)))
link_prepare_wakeup(l);
@@ -1347,6 +1353,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
if (mtyp == STATE_MSG) {
+ if (l->peer_caps & TIPC_LINK_PROTO_SEQNO)
+ msg_set_seqno(hdr, l->snd_nxt_state++);
msg_set_seq_gap(hdr, rcvgap);
msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl));
msg_set_probe(hdr, probe);
@@ -1438,6 +1446,44 @@ tnl:
}
}
+/* tipc_link_validate_msg(): validate message against current link state
+ * Returns true if message should be accepted, otherwise false
+ */
+bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr)
+{
+ u16 curr_session = l->peer_session;
+ u16 session = msg_session(hdr);
+ int mtyp = msg_type(hdr);
+
+ if (msg_user(hdr) != LINK_PROTOCOL)
+ return true;
+
+ switch (mtyp) {
+ case RESET_MSG:
+ if (!l->in_session)
+ return true;
+ /* Accept only RESET with new session number */
+ return more(session, curr_session);
+ case ACTIVATE_MSG:
+ if (!l->in_session)
+ return true;
+ /* Accept only ACTIVATE with new or current session number */
+ return !less(session, curr_session);
+ case STATE_MSG:
+ /* Accept only STATE with current session number */
+ if (!l->in_session)
+ return false;
+ if (session != curr_session)
+ return false;
+ if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO))
+ return true;
+ /* Accept only STATE with new sequence number */
+ return !less(msg_seqno(hdr), l->rcv_nxt_state);
+ default:
+ return false;
+ }
+}
+
/* tipc_link_proto_rcv(): receive link level protocol message :
* Note that network plane id propagates through the network, and may
* change at any time. The node with lowest numerical id determines
@@ -1471,17 +1517,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
hdr = buf_msg(skb);
data = msg_data(hdr);
+ if (!tipc_link_validate_msg(l, hdr))
+ goto exit;
+
switch (mtyp) {
case RESET_MSG:
-
- /* Ignore duplicate RESET with old session number */
- if ((less_eq(msg_session(hdr), l->peer_session)) &&
- (l->peer_session != ANY_SESSION))
- break;
- /* fall thru' */
-
case ACTIVATE_MSG:
-
/* Complete own link name with peer's interface name */
if_name = strrchr(l->name, ':') + 1;
if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME)
@@ -1509,12 +1550,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
rc = TIPC_LINK_UP_EVT;
l->peer_session = msg_session(hdr);
+ l->in_session = true;
l->peer_bearer_id = msg_bearer_id(hdr);
if (l->mtu > msg_max_pkt(hdr))
l->mtu = msg_max_pkt(hdr);
break;
case STATE_MSG:
+ l->rcv_nxt_state = msg_seqno(hdr) + 1;
/* Update own tolerance if peer indicates a non-zero value */
if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
diff --git a/net/tipc/link.h b/net/tipc/link.h
index ec59348a81e8..7bc494a33fdf 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -110,6 +110,8 @@ char *tipc_link_name(struct tipc_link *l);
char tipc_link_plane(struct tipc_link *l);
int tipc_link_prio(struct tipc_link *l);
int tipc_link_window(struct tipc_link *l);
+void tipc_link_update_caps(struct tipc_link *l, u16 capabilities);
+bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr);
unsigned long tipc_link_tolerance(struct tipc_link *l);
void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
struct sk_buff_head *xmitq);
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index b6c45dccba3d..b61891054709 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -416,26 +416,31 @@ bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu)
*/
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
{
- struct tipc_msg *msg;
- int imsz, offset;
+ struct tipc_msg *hdr, *ihdr;
+ int imsz;
*iskb = NULL;
if (unlikely(skb_linearize(skb)))
goto none;
- msg = buf_msg(skb);
- offset = msg_hdr_sz(msg) + *pos;
- if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE)))
+ hdr = buf_msg(skb);
+ if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE)))
goto none;
- *iskb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!*iskb))
+ ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos);
+ imsz = msg_size(ihdr);
+
+ if ((*pos + imsz) > msg_data_sz(hdr))
goto none;
- skb_pull(*iskb, offset);
- imsz = msg_size(buf_msg(*iskb));
- skb_trim(*iskb, imsz);
+
+ *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC);
+ if (!*iskb)
+ goto none;
+
+ skb_copy_to_linear_data(*iskb, ihdr, imsz);
if (unlikely(!tipc_msg_validate(iskb)))
goto none;
+
*pos += align(imsz);
return true;
none:
@@ -531,12 +536,6 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
msg_set_hdr_sz(hdr, BASIC_H_SIZE);
}
- if (skb_cloned(_skb) &&
- pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
- goto exit;
-
- /* reassign after skb header modifications */
- hdr = buf_msg(_skb);
/* Now reverse the concerned fields */
msg_set_errcode(hdr, err);
msg_set_non_seq(hdr, 0);
@@ -595,10 +594,6 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
if (!skb_cloned(skb))
return true;
- /* Unclone buffer in case it was bundled */
- if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
- return false;
-
return true;
}
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 0453bd451ce8..68014f1b6976 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -45,6 +45,7 @@
#include "netlink.h"
#define INVALID_NODE_SIG 0x10000
+#define NODE_CLEANUP_AFTER 300000
/* Flags used to take different actions according to flag type
* TIPC_NOTIFY_NODE_DOWN: notify node is down
@@ -96,6 +97,7 @@ struct tipc_bclink_entry {
* @link_id: local and remote bearer ids of changing link, if any
* @publ_list: list of publications
* @rcu: rcu struct for tipc_node
+ * @delete_at: indicates the time for deleting a down node
*/
struct tipc_node {
u32 addr;
@@ -121,6 +123,7 @@ struct tipc_node {
unsigned long keepalive_intv;
struct timer_list timer;
struct rcu_head rcu;
+ unsigned long delete_at;
};
/* Node FSM states and events:
@@ -160,6 +163,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
static void tipc_node_put(struct tipc_node *node);
static bool node_is_up(struct tipc_node *n);
+static void tipc_node_delete_from_list(struct tipc_node *node);
struct tipc_sock_conn {
u32 port;
@@ -359,13 +363,24 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_node *n, *temp_node;
+ struct tipc_link *l;
+ int bearer_id;
int i;
spin_lock_bh(&tn->node_list_lock);
n = tipc_node_find(net, addr);
if (n) {
+ if (n->capabilities == capabilities)
+ goto exit;
/* Same node may come back with new capabilities */
+ write_lock_bh(&n->lock);
n->capabilities = capabilities;
+ for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ l = n->links[bearer_id].link;
+ if (l)
+ tipc_link_update_caps(l, capabilities);
+ }
+ write_unlock_bh(&n->lock);
goto exit;
}
n = kzalloc(sizeof(*n), GFP_ATOMIC);
@@ -390,6 +405,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
for (i = 0; i < MAX_BEARERS; i++)
spin_lock_init(&n->links[i].lock);
n->state = SELF_DOWN_PEER_LEAVING;
+ n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
n->signature = INVALID_NODE_SIG;
n->active_links[0] = INVALID_BEARER_ID;
n->active_links[1] = INVALID_BEARER_ID;
@@ -433,11 +449,16 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
tipc_link_set_abort_limit(l, tol / n->keepalive_intv);
}
-static void tipc_node_delete(struct tipc_node *node)
+static void tipc_node_delete_from_list(struct tipc_node *node)
{
list_del_rcu(&node->list);
hlist_del_rcu(&node->hash);
tipc_node_put(node);
+}
+
+static void tipc_node_delete(struct tipc_node *node)
+{
+ tipc_node_delete_from_list(node);
del_timer_sync(&node->timer);
tipc_node_put(node);
@@ -544,6 +565,42 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
tipc_node_put(node);
}
+static void tipc_node_clear_links(struct tipc_node *node)
+{
+ int i;
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ struct tipc_link_entry *le = &node->links[i];
+
+ if (le->link) {
+ kfree(le->link);
+ le->link = NULL;
+ node->link_cnt--;
+ }
+ }
+}
+
+/* tipc_node_cleanup - delete nodes that does not
+ * have active links for NODE_CLEANUP_AFTER time
+ */
+static int tipc_node_cleanup(struct tipc_node *peer)
+{
+ struct tipc_net *tn = tipc_net(peer->net);
+ bool deleted = false;
+
+ spin_lock_bh(&tn->node_list_lock);
+ tipc_node_write_lock(peer);
+
+ if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) {
+ tipc_node_clear_links(peer);
+ tipc_node_delete_from_list(peer);
+ deleted = true;
+ }
+ tipc_node_write_unlock(peer);
+ spin_unlock_bh(&tn->node_list_lock);
+ return deleted;
+}
+
/* tipc_node_timeout - handle expiration of node timer
*/
static void tipc_node_timeout(struct timer_list *t)
@@ -551,21 +608,29 @@ static void tipc_node_timeout(struct timer_list *t)
struct tipc_node *n = from_timer(n, t, timer);
struct tipc_link_entry *le;
struct sk_buff_head xmitq;
+ int remains = n->link_cnt;
int bearer_id;
int rc = 0;
+ if (!node_is_up(n) && tipc_node_cleanup(n)) {
+ /*Removing the reference of Timer*/
+ tipc_node_put(n);
+ return;
+ }
+
__skb_queue_head_init(&xmitq);
- for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) {
tipc_node_read_lock(n);
le = &n->links[bearer_id];
- spin_lock_bh(&le->lock);
if (le->link) {
+ spin_lock_bh(&le->lock);
/* Link tolerance may change asynchronously: */
tipc_node_calculate_timer(n, le->link);
rc = tipc_link_timeout(le->link, &xmitq);
+ spin_unlock_bh(&le->lock);
+ remains--;
}
- spin_unlock_bh(&le->lock);
tipc_node_read_unlock(n);
tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr);
if (rc & TIPC_LINK_DOWN_EVT)
@@ -1174,6 +1239,7 @@ static void node_lost_contact(struct tipc_node *n,
uint i;
pr_debug("Lost contact with %x\n", n->addr);
+ n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
/* Clean up broadcast state */
tipc_bcast_remove_peer(n->net, n->bc_entry.link);
@@ -1481,7 +1547,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
* tipc_node_check_state - check and if necessary update node state
* @skb: TIPC packet
* @bearer_id: identity of bearer delivering the packet
- * Returns true if state is ok, otherwise consumes buffer and returns false
+ * Returns true if state and msg are ok, otherwise false
*/
static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
int bearer_id, struct sk_buff_head *xmitq)
@@ -1515,6 +1581,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
}
}
+ if (!tipc_link_validate_msg(l, hdr))
+ return false;
+
/* Check and update node accesibility if applicable */
if (state == SELF_UP_PEER_COMING) {
if (!tipc_link_is_up(l))
@@ -1743,7 +1812,6 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
struct tipc_node *peer;
u32 addr;
int err;
- int i;
/* We identify the peer by its net */
if (!info->attrs[TIPC_NLA_NET])
@@ -1778,15 +1846,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
goto err_out;
}
- for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link_entry *le = &peer->links[i];
-
- if (le->link) {
- kfree(le->link);
- le->link = NULL;
- peer->link_cnt--;
- }
- }
+ tipc_node_clear_links(peer);
tipc_node_write_unlock(peer);
tipc_node_delete(peer);
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 846c8f240872..48b3298a248d 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -49,14 +49,16 @@ enum {
TIPC_BCAST_STATE_NACK = (1 << 2),
TIPC_BLOCK_FLOWCTL = (1 << 3),
TIPC_BCAST_RCAST = (1 << 4),
- TIPC_NODE_ID128 = (1 << 5)
+ TIPC_NODE_ID128 = (1 << 5),
+ TIPC_LINK_PROTO_SEQNO = (1 << 6)
};
-#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
- TIPC_BCAST_STATE_NACK | \
- TIPC_BCAST_RCAST | \
- TIPC_BLOCK_FLOWCTL | \
- TIPC_NODE_ID128)
+#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
+ TIPC_BCAST_STATE_NACK | \
+ TIPC_BCAST_RCAST | \
+ TIPC_BLOCK_FLOWCTL | \
+ TIPC_NODE_ID128 | \
+ TIPC_LINK_PROTO_SEQNO)
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 930852c54d7a..3d21414ba357 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3320,6 +3320,11 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
goto stat_msg_cancel;
nla_nest_end(skb, stat);
+
+ if (tsk->group)
+ if (tipc_group_fill_sock_diag(tsk->group, skb))
+ goto stat_msg_cancel;
+
nla_nest_end(skb, attrs);
return 0;
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a7a8f8e20ff3..1e968d238adf 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -52,9 +52,12 @@ static DEFINE_SPINLOCK(tls_device_lock);
static void tls_device_free_ctx(struct tls_context *ctx)
{
- struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx);
+ if (ctx->tx_conf == TLS_HW)
+ kfree(tls_offload_ctx_tx(ctx));
+
+ if (ctx->rx_conf == TLS_HW)
+ kfree(tls_offload_ctx_rx(ctx));
- kfree(offload_ctx);
kfree(ctx);
}
@@ -71,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work)
list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
struct net_device *netdev = ctx->netdev;
- if (netdev) {
+ if (netdev && ctx->tx_conf == TLS_HW) {
netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
TLS_OFFLOAD_CTX_DIR_TX);
dev_put(netdev);
+ ctx->netdev = NULL;
}
list_del(&ctx->list);
@@ -82,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work)
}
}
+static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
+ struct net_device *netdev)
+{
+ if (sk->sk_destruct != tls_device_sk_destruct) {
+ refcount_set(&ctx->refcount, 1);
+ dev_hold(netdev);
+ ctx->netdev = netdev;
+ spin_lock_irq(&tls_device_lock);
+ list_add_tail(&ctx->list, &tls_device_list);
+ spin_unlock_irq(&tls_device_lock);
+
+ ctx->sk_destruct = sk->sk_destruct;
+ sk->sk_destruct = tls_device_sk_destruct;
+ }
+}
+
static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
{
unsigned long flags;
@@ -125,7 +145,7 @@ static void destroy_record(struct tls_record_info *record)
kfree(record);
}
-static void delete_all_records(struct tls_offload_context *offload_ctx)
+static void delete_all_records(struct tls_offload_context_tx *offload_ctx)
{
struct tls_record_info *info, *temp;
@@ -141,14 +161,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_record_info *info, *temp;
- struct tls_offload_context *ctx;
+ struct tls_offload_context_tx *ctx;
u64 deleted_records = 0;
unsigned long flags;
if (!tls_ctx)
return;
- ctx = tls_offload_ctx(tls_ctx);
+ ctx = tls_offload_ctx_tx(tls_ctx);
spin_lock_irqsave(&ctx->lock, flags);
info = ctx->retransmit_hint;
@@ -179,15 +199,17 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
void tls_device_sk_destruct(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
- if (ctx->open_record)
- destroy_record(ctx->open_record);
+ tls_ctx->sk_destruct(sk);
- delete_all_records(ctx);
- crypto_free_aead(ctx->aead_send);
- ctx->sk_destruct(sk);
- clean_acked_data_disable(inet_csk(sk));
+ if (tls_ctx->tx_conf == TLS_HW) {
+ if (ctx->open_record)
+ destroy_record(ctx->open_record);
+ delete_all_records(ctx);
+ crypto_free_aead(ctx->aead_send);
+ clean_acked_data_disable(inet_csk(sk));
+ }
if (refcount_dec_and_test(&tls_ctx->refcount))
tls_device_queue_ctx_destruction(tls_ctx);
@@ -219,7 +241,7 @@ static void tls_append_frag(struct tls_record_info *record,
static int tls_push_record(struct sock *sk,
struct tls_context *ctx,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct tls_record_info *record,
struct page_frag *pfrag,
int flags,
@@ -264,7 +286,7 @@ static int tls_push_record(struct sock *sk,
return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}
-static int tls_create_new_record(struct tls_offload_context *offload_ctx,
+static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
{
@@ -290,7 +312,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx,
}
static int tls_do_allocation(struct sock *sk,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
{
@@ -324,7 +346,7 @@ static int tls_push_data(struct sock *sk,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
struct tls_record_info *record = ctx->open_record;
@@ -477,7 +499,7 @@ out:
return rc;
}
-struct tls_record_info *tls_get_record(struct tls_offload_context *context,
+struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
u32 seq, u64 *p_record_sn)
{
u64 record_sn = context->hint_record_sn;
@@ -520,11 +542,123 @@ static int tls_device_push_pending_record(struct sock *sk, int flags)
return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
}
+void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct net_device *netdev = tls_ctx->netdev;
+ struct tls_offload_context_rx *rx_ctx;
+ u32 is_req_pending;
+ s64 resync_req;
+ u32 req_seq;
+
+ if (tls_ctx->rx_conf != TLS_HW)
+ return;
+
+ rx_ctx = tls_offload_ctx_rx(tls_ctx);
+ resync_req = atomic64_read(&rx_ctx->resync_req);
+ req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1);
+ is_req_pending = resync_req;
+
+ if (unlikely(is_req_pending) && req_seq == seq &&
+ atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+ netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
+ seq + TLS_HEADER_SIZE - 1,
+ rcd_sn);
+}
+
+static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
+{
+ struct strp_msg *rxm = strp_msg(skb);
+ int err = 0, offset = rxm->offset, copy, nsg;
+ struct sk_buff *skb_iter, *unused;
+ struct scatterlist sg[1];
+ char *orig_buf, *buf;
+
+ orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation);
+ if (!orig_buf)
+ return -ENOMEM;
+ buf = orig_buf;
+
+ nsg = skb_cow_data(skb, 0, &unused);
+ if (unlikely(nsg < 0)) {
+ err = nsg;
+ goto free_buf;
+ }
+
+ sg_init_table(sg, 1);
+ sg_set_buf(&sg[0], buf,
+ rxm->full_len + TLS_HEADER_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE);
+ skb_copy_bits(skb, offset, buf,
+ TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+
+ /* We are interested only in the decrypted data not the auth */
+ err = decrypt_skb(sk, skb, sg);
+ if (err != -EBADMSG)
+ goto free_buf;
+ else
+ err = 0;
+
+ copy = min_t(int, skb_pagelen(skb) - offset,
+ rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+ if (skb->decrypted)
+ skb_store_bits(skb, offset, buf, copy);
+
+ offset += copy;
+ buf += copy;
+
+ skb_walk_frags(skb, skb_iter) {
+ copy = min_t(int, skb_iter->len,
+ rxm->full_len - offset + rxm->offset -
+ TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+ if (skb_iter->decrypted)
+ skb_store_bits(skb_iter, offset, buf, copy);
+
+ offset += copy;
+ buf += copy;
+ }
+
+free_buf:
+ kfree(orig_buf);
+ return err;
+}
+
+int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
+ int is_decrypted = skb->decrypted;
+ int is_encrypted = !is_decrypted;
+ struct sk_buff *skb_iter;
+
+ /* Skip if it is already decrypted */
+ if (ctx->sw.decrypted)
+ return 0;
+
+ /* Check if all the data is decrypted already */
+ skb_walk_frags(skb, skb_iter) {
+ is_decrypted &= skb_iter->decrypted;
+ is_encrypted &= !skb_iter->decrypted;
+ }
+
+ ctx->sw.decrypted |= is_decrypted;
+
+ /* Return immedeatly if the record is either entirely plaintext or
+ * entirely ciphertext. Otherwise handle reencrypt partially decrypted
+ * record.
+ */
+ return (is_encrypted || is_decrypted) ? 0 :
+ tls_device_reencrypt(sk, skb);
+}
+
int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
{
u16 nonce_size, tag_size, iv_size, rec_seq_size;
struct tls_record_info *start_marker_record;
- struct tls_offload_context *offload_ctx;
+ struct tls_offload_context_tx *offload_ctx;
struct tls_crypto_info *crypto_info;
struct net_device *netdev;
char *iv, *rec_seq;
@@ -546,7 +680,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
goto out;
}
- offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
+ offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
if (!offload_ctx) {
rc = -ENOMEM;
goto free_marker_record;
@@ -609,7 +743,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
ctx->push_pending_record = tls_device_push_pending_record;
- offload_ctx->sk_destruct = sk->sk_destruct;
/* TLS offload is greatly simplified if we don't send
* SKBs where only part of the payload needs to be encrypted.
@@ -619,8 +752,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
if (skb)
TCP_SKB_CB(skb)->eor = 1;
- refcount_set(&ctx->refcount, 1);
-
/* We support starting offload on multiple sockets
* concurrently, so we only need a read lock here.
* This lock must precede get_netdev_for_sock to prevent races between
@@ -655,19 +786,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
if (rc)
goto release_netdev;
- ctx->netdev = netdev;
-
- spin_lock_irq(&tls_device_lock);
- list_add_tail(&ctx->list, &tls_device_list);
- spin_unlock_irq(&tls_device_lock);
+ tls_device_attach(ctx, sk, netdev);
- sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
/* following this assignment tls_is_sk_tx_device_offloaded
* will return true and the context might be accessed
* by the netdev's xmit function.
*/
- smp_store_release(&sk->sk_destruct,
- &tls_device_sk_destruct);
+ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
+ dev_put(netdev);
up_read(&device_offload_lock);
goto out;
@@ -690,6 +816,105 @@ out:
return rc;
}
+int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
+{
+ struct tls_offload_context_rx *context;
+ struct net_device *netdev;
+ int rc = 0;
+
+ /* We support starting offload on multiple sockets
+ * concurrently, so we only need a read lock here.
+ * This lock must precede get_netdev_for_sock to prevent races between
+ * NETDEV_DOWN and setsockopt.
+ */
+ down_read(&device_offload_lock);
+ netdev = get_netdev_for_sock(sk);
+ if (!netdev) {
+ pr_err_ratelimited("%s: netdev not found\n", __func__);
+ rc = -EINVAL;
+ goto release_lock;
+ }
+
+ if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
+ pr_err_ratelimited("%s: netdev %s with no TLS offload\n",
+ __func__, netdev->name);
+ rc = -ENOTSUPP;
+ goto release_netdev;
+ }
+
+ /* Avoid offloading if the device is down
+ * We don't want to offload new flows after
+ * the NETDEV_DOWN event
+ */
+ if (!(netdev->flags & IFF_UP)) {
+ rc = -EINVAL;
+ goto release_netdev;
+ }
+
+ context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL);
+ if (!context) {
+ rc = -ENOMEM;
+ goto release_netdev;
+ }
+
+ ctx->priv_ctx_rx = context;
+ rc = tls_set_sw_offload(sk, ctx, 0);
+ if (rc)
+ goto release_ctx;
+
+ rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
+ &ctx->crypto_recv,
+ tcp_sk(sk)->copied_seq);
+ if (rc) {
+ pr_err_ratelimited("%s: The netdev has refused to offload this socket\n",
+ __func__);
+ goto free_sw_resources;
+ }
+
+ tls_device_attach(ctx, sk, netdev);
+ goto release_netdev;
+
+free_sw_resources:
+ tls_sw_free_resources_rx(sk);
+release_ctx:
+ ctx->priv_ctx_rx = NULL;
+release_netdev:
+ dev_put(netdev);
+release_lock:
+ up_read(&device_offload_lock);
+ return rc;
+}
+
+void tls_device_offload_cleanup_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct net_device *netdev;
+
+ down_read(&device_offload_lock);
+ netdev = tls_ctx->netdev;
+ if (!netdev)
+ goto out;
+
+ if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
+ pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n",
+ __func__);
+ goto out;
+ }
+
+ netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx,
+ TLS_OFFLOAD_CTX_DIR_RX);
+
+ if (tls_ctx->tx_conf != TLS_HW) {
+ dev_put(netdev);
+ tls_ctx->netdev = NULL;
+ }
+out:
+ up_read(&device_offload_lock);
+ kfree(tls_ctx->rx.rec_seq);
+ kfree(tls_ctx->rx.iv);
+ tls_sw_release_resources_rx(sk);
+}
+
static int tls_device_down(struct net_device *netdev)
{
struct tls_context *ctx, *tmp;
@@ -710,8 +935,12 @@ static int tls_device_down(struct net_device *netdev)
spin_unlock_irqrestore(&tls_device_lock, flags);
list_for_each_entry_safe(ctx, tmp, &list, list) {
- netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
- TLS_OFFLOAD_CTX_DIR_TX);
+ if (ctx->tx_conf == TLS_HW)
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ if (ctx->rx_conf == TLS_HW)
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_RX);
ctx->netdev = NULL;
dev_put(netdev);
list_del_init(&ctx->list);
@@ -732,12 +961,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- if (!(dev->features & NETIF_F_HW_TLS_TX))
+ if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
return NOTIFY_DONE;
switch (event) {
case NETDEV_REGISTER:
case NETDEV_FEAT_CHANGE:
+ if ((dev->features & NETIF_F_HW_TLS_RX) &&
+ !dev->tlsdev_ops->tls_dev_resync_rx)
+ return NOTIFY_BAD;
+
if (dev->tlsdev_ops &&
dev->tlsdev_ops->tls_dev_add &&
dev->tlsdev_ops->tls_dev_del)
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 748914abdb60..e3313c45663f 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
static int fill_sg_in(struct scatterlist *sg_in,
struct sk_buff *skb,
- struct tls_offload_context *ctx,
+ struct tls_offload_context_tx *ctx,
u64 *rcd_sn,
s32 *sync_size,
int *resync_sgs)
@@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
s32 sync_size, u64 rcd_sn)
{
int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int payload_len = skb->len - tcp_payload_offset;
void *buf, *iv, *aad, *dummy_buf;
struct aead_request *aead_req;
@@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
{
int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int payload_len = skb->len - tcp_payload_offset;
struct scatterlist *sg_in, sg_out[3];
struct sk_buff *nskb = NULL;
@@ -413,9 +413,10 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
return tls_sw_fallback(sk, skb);
}
+EXPORT_SYMBOL_GPL(tls_validate_xmit_skb);
int tls_sw_fallback_init(struct sock *sk,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct tls_crypto_info *crypto_info)
{
const u8 *key;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 301f22430469..b09867c8b817 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -51,15 +51,6 @@ enum {
TLSV6,
TLS_NUM_PROTS,
};
-enum {
- TLS_BASE,
- TLS_SW,
-#ifdef CONFIG_TLS_DEVICE
- TLS_HW,
-#endif
- TLS_HW_RECORD,
- TLS_NUM_CONFIG,
-};
static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
@@ -290,7 +281,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
}
#ifdef CONFIG_TLS_DEVICE
- if (ctx->tx_conf != TLS_HW) {
+ if (ctx->rx_conf == TLS_HW)
+ tls_device_offload_cleanup_rx(sk);
+
+ if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) {
#else
{
#endif
@@ -470,8 +464,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
conf = TLS_SW;
}
} else {
- rc = tls_set_sw_offload(sk, ctx, 0);
- conf = TLS_SW;
+#ifdef CONFIG_TLS_DEVICE
+ rc = tls_set_device_offload_rx(sk, ctx);
+ conf = TLS_HW;
+ if (rc) {
+#else
+ {
+#endif
+ rc = tls_set_sw_offload(sk, ctx, 0);
+ conf = TLS_SW;
+ }
}
if (rc)
@@ -629,6 +631,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
+
+ prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW];
+
+ prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW];
+
+ prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW];
#endif
prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 4618f1c31137..0c2d029c9d4c 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -53,18 +53,14 @@ static int tls_do_decryption(struct sock *sk,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct strp_msg *rxm = strp_msg(skb);
struct aead_request *aead_req;
int ret;
- unsigned int req_size = sizeof(struct aead_request) +
- crypto_aead_reqsize(ctx->aead_recv);
- aead_req = kzalloc(req_size, flags);
+ aead_req = aead_request_alloc(ctx->aead_recv, flags);
if (!aead_req)
return -ENOMEM;
- aead_request_set_tfm(aead_req, ctx->aead_recv);
aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
aead_request_set_crypt(aead_req, sgin, sgout,
data_len + tls_ctx->rx.tag_size,
@@ -74,19 +70,7 @@ static int tls_do_decryption(struct sock *sk,
ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait);
- if (ret < 0)
- goto out;
-
- rxm->offset += tls_ctx->rx.prepend_size;
- rxm->full_len -= tls_ctx->rx.overhead_size;
- tls_advance_record_sn(sk, &tls_ctx->rx);
-
- ctx->decrypted = true;
-
- ctx->saved_data_ready(sk);
-
-out:
- kfree(aead_req);
+ aead_request_free(aead_req);
return ret;
}
@@ -224,8 +208,7 @@ static int tls_push_record(struct sock *sk, int flags,
struct aead_request *req;
int rc;
- req = kzalloc(sizeof(struct aead_request) +
- crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation);
+ req = aead_request_alloc(ctx->aead_send, sk->sk_allocation);
if (!req)
return -ENOMEM;
@@ -267,7 +250,7 @@ static int tls_push_record(struct sock *sk, int flags,
tls_advance_record_sn(sk, &tls_ctx->tx);
out_req:
- kfree(req);
+ aead_request_free(req);
return rc;
}
@@ -280,7 +263,7 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
int length, int *pages_used,
unsigned int *size_used,
struct scatterlist *to, int to_max_pages,
- bool charge)
+ bool charge, bool revert)
{
struct page *pages[MAX_SKB_FRAGS];
@@ -331,6 +314,8 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
out:
*size_used = size;
*pages_used = num_elem;
+ if (revert)
+ iov_iter_revert(from, size);
return rc;
}
@@ -432,7 +417,7 @@ alloc_encrypted:
&ctx->sg_plaintext_size,
ctx->sg_plaintext_data,
ARRAY_SIZE(ctx->sg_plaintext_data),
- true);
+ true, false);
if (ret)
goto fallback_to_reg_send;
@@ -670,8 +655,38 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
return skb;
}
-static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
- struct scatterlist *sgout)
+/*
+ * Decrypt one received record (unless it is already marked decrypted) and
+ * update the receive-side state that goes with it.
+ *
+ * With CONFIG_TLS_DEVICE, tls_device_decrypted() runs first and a negative
+ * result is returned as-is.  If ctx->decrypted is already set, decrypt_skb()
+ * is skipped and *zc is cleared; otherwise *zc is left untouched.
+ *
+ * On success the strparser message is advanced past the record prepend and
+ * shortened by the record overhead, the RX record sequence number is
+ * advanced, the record is marked decrypted and the saved data-ready callback
+ * is invoked.
+ *
+ * Returns a negative error from either step, otherwise the last
+ * non-negative status.
+ */
+static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+			      struct scatterlist *sgout, bool *zc)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+	struct strp_msg *rxm = strp_msg(skb);
+	int ret = 0;
+
+#ifdef CONFIG_TLS_DEVICE
+	/* NOTE(review): presumably flags device-decrypted records - confirm */
+	ret = tls_device_decrypted(sk, skb);
+	if (ret < 0)
+		return ret;
+#endif
+	if (ctx->decrypted) {
+		/* nothing was written through sgout, so no zero-copy result */
+		*zc = false;
+	} else {
+		ret = decrypt_skb(sk, skb, sgout);
+		if (ret < 0)
+			return ret;
+	}
+
+	/* expose only the payload of the record to the reader */
+	rxm->offset += tls_ctx->rx.prepend_size;
+	rxm->full_len -= tls_ctx->rx.overhead_size;
+
+	tls_advance_record_sn(sk, &tls_ctx->rx);
+	ctx->decrypted = true;
+	ctx->saved_data_ready(sk);
+
+	return ret;
+}
+
+int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+ struct scatterlist *sgout)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -817,11 +832,11 @@ int tls_sw_recvmsg(struct sock *sk,
err = zerocopy_from_iter(sk, &msg->msg_iter,
to_copy, &pages,
&chunk, &sgin[1],
- MAX_SKB_FRAGS, false);
+ MAX_SKB_FRAGS, false, true);
if (err < 0)
goto fallback_to_reg_recv;
- err = decrypt_skb(sk, skb, sgin);
+ err = decrypt_skb_update(sk, skb, sgin, &zc);
for (; pages > 0; pages--)
put_page(sg_page(&sgin[pages]));
if (err < 0) {
@@ -830,7 +845,7 @@ int tls_sw_recvmsg(struct sock *sk,
}
} else {
fallback_to_reg_recv:
- err = decrypt_skb(sk, skb, NULL);
+ err = decrypt_skb_update(sk, skb, NULL, &zc);
if (err < 0) {
tls_err_abort(sk, EBADMSG);
goto recv_end;
@@ -885,6 +900,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
int err = 0;
long timeo;
int chunk;
+ bool zc;
lock_sock(sk);
@@ -901,7 +917,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
}
if (!ctx->decrypted) {
- err = decrypt_skb(sk, skb, NULL);
+ err = decrypt_skb_update(sk, skb, NULL, &zc);
if (err < 0) {
tls_err_abort(sk, EBADMSG);
@@ -947,7 +963,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- char header[tls_ctx->rx.prepend_size];
+ char header[TLS_HEADER_SIZE + MAX_IV_SIZE];
struct strp_msg *rxm = strp_msg(skb);
size_t cipher_overhead;
size_t data_len = 0;
@@ -957,6 +973,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
if (rxm->offset + tls_ctx->rx.prepend_size > skb->len)
return 0;
+ /* Sanity-check size of on-stack buffer. */
+ if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) {
+ ret = -EINVAL;
+ goto read_failure;
+ }
+
/* Linearize header to local buffer */
ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size);
@@ -984,6 +1006,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
goto read_failure;
}
+#ifdef CONFIG_TLS_DEVICE
+ handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
+ *(u64*)tls_ctx->rx.rec_seq);
+#endif
return data_len + TLS_HEADER_SIZE;
read_failure:
@@ -996,9 +1022,6 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct strp_msg *rxm;
-
- rxm = strp_msg(skb);
ctx->decrypted = false;
@@ -1028,7 +1051,7 @@ void tls_sw_free_resources_tx(struct sock *sk)
kfree(ctx);
}
-void tls_sw_free_resources_rx(struct sock *sk)
+void tls_sw_release_resources_rx(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -1047,6 +1070,14 @@ void tls_sw_free_resources_rx(struct sock *sk)
strp_done(&ctx->strp);
lock_sock(sk);
}
+}
+
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+
+ tls_sw_release_resources_rx(sk);
kfree(ctx);
}
@@ -1071,28 +1102,38 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
}
if (tx) {
- sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
- if (!sw_ctx_tx) {
- rc = -ENOMEM;
- goto out;
+ if (!ctx->priv_ctx_tx) {
+ sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
+ if (!sw_ctx_tx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ctx->priv_ctx_tx = sw_ctx_tx;
+ } else {
+ sw_ctx_tx =
+ (struct tls_sw_context_tx *)ctx->priv_ctx_tx;
}
- crypto_init_wait(&sw_ctx_tx->async_wait);
- ctx->priv_ctx_tx = sw_ctx_tx;
} else {
- sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
- if (!sw_ctx_rx) {
- rc = -ENOMEM;
- goto out;
+ if (!ctx->priv_ctx_rx) {
+ sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
+ if (!sw_ctx_rx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ctx->priv_ctx_rx = sw_ctx_rx;
+ } else {
+ sw_ctx_rx =
+ (struct tls_sw_context_rx *)ctx->priv_ctx_rx;
}
- crypto_init_wait(&sw_ctx_rx->async_wait);
- ctx->priv_ctx_rx = sw_ctx_rx;
}
if (tx) {
+ crypto_init_wait(&sw_ctx_tx->async_wait);
crypto_info = &ctx->crypto_send;
cctx = &ctx->tx;
aead = &sw_ctx_tx->aead_send;
} else {
+ crypto_init_wait(&sw_ctx_rx->async_wait);
crypto_info = &ctx->crypto_recv;
cctx = &ctx->rx;
aead = &sw_ctx_rx->aead_recv;
@@ -1117,7 +1158,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
}
/* Sanity-check the IV size for stack allocations. */
- if (iv_size > MAX_IV_SIZE) {
+ if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) {
rc = -EINVAL;
goto free_priv;
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 48e8097339ab..a88551f3bc43 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -3,7 +3,7 @@
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright 2015 Intel Deutschland GmbH
+ * Copyright 2015-2017 Intel Deutschland GmbH
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy)
/* sanity check supported bands/channels */
for (band = 0; band < NUM_NL80211_BANDS; band++) {
+ u16 types = 0;
+
sband = wiphy->bands[band];
if (!sband)
continue;
@@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy)
sband->channels[i].band = band;
}
+ for (i = 0; i < sband->n_iftype_data; i++) {
+ const struct ieee80211_sband_iftype_data *iftd;
+
+ iftd = &sband->iftype_data[i];
+
+ if (WARN_ON(!iftd->types_mask))
+ return -EINVAL;
+ if (WARN_ON(types & iftd->types_mask))
+ return -EINVAL;
+
+ /* at least one piece of information must be present */
+ if (WARN_ON(!iftd->he_cap.has_he))
+ return -EINVAL;
+
+ types |= iftd->types_mask;
+ }
+
have_band = true;
}
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 63eb1b5fdd04..7f52ef569320 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -76,7 +76,7 @@ struct cfg80211_registered_device {
struct cfg80211_scan_request *scan_req; /* protected by RTNL */
struct sk_buff *scan_msg;
struct list_head sched_scan_req_list;
- unsigned long suspend_at;
+ time64_t suspend_at;
struct work_struct scan_done_wk;
struct genl_info *cur_cmd_info;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4eece06be1e7..e4e5f025d16b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -428,6 +428,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
+ [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
+ .len = NL80211_HE_MAX_CAPABILITY_LEN },
};
/* policy for the key attributes */
@@ -1324,6 +1326,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg,
return 0;
}
+/*
+ * Put the attributes describing one ieee80211_sband_iftype_data entry into
+ * @msg: the interface-type mask and, when the entry has HE capabilities
+ * (he_cap.has_he), the HE MAC/PHY capability info, the HE MCS/NSS set and
+ * the PPE thresholds.
+ *
+ * Returns 0 on success or -ENOBUFS as soon as one attribute cannot be added.
+ */
+static int
+nl80211_send_iftype_data(struct sk_buff *msg,
+			 const struct ieee80211_sband_iftype_data *iftdata)
+{
+	const struct ieee80211_sta_he_cap *he = &iftdata->he_cap;
+
+	if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
+				iftdata->types_mask))
+		return -ENOBUFS;
+
+	/* the remaining attributes are all HE specific */
+	if (!he->has_he)
+		return 0;
+
+	if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
+		    sizeof(he->he_cap_elem.mac_cap_info),
+		    he->he_cap_elem.mac_cap_info))
+		return -ENOBUFS;
+
+	if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
+		    sizeof(he->he_cap_elem.phy_cap_info),
+		    he->he_cap_elem.phy_cap_info))
+		return -ENOBUFS;
+
+	if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
+		    sizeof(he->he_mcs_nss_supp),
+		    &he->he_mcs_nss_supp))
+		return -ENOBUFS;
+
+	if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
+		    sizeof(he->ppe_thres), he->ppe_thres))
+		return -ENOBUFS;
+
+	return 0;
+}
+
static int nl80211_send_band_rateinfo(struct sk_buff *msg,
struct ieee80211_supported_band *sband)
{
@@ -1353,6 +1383,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
sband->vht_cap.cap)))
return -ENOBUFS;
+ if (sband->n_iftype_data) {
+ struct nlattr *nl_iftype_data =
+ nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA);
+ int err;
+
+ if (!nl_iftype_data)
+ return -ENOBUFS;
+
+ for (i = 0; i < sband->n_iftype_data; i++) {
+ struct nlattr *iftdata;
+
+ iftdata = nla_nest_start(msg, i + 1);
+ if (!iftdata)
+ return -ENOBUFS;
+
+ err = nl80211_send_iftype_data(msg,
+ &sband->iftype_data[i]);
+ if (err)
+ return err;
+
+ nla_nest_end(msg, iftdata);
+ }
+
+ nla_nest_end(msg, nl_iftype_data);
+ }
+
/* add bitrates */
nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES);
if (!nl_rates)
@@ -2757,7 +2813,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) ||
nla_put_u32(msg, NL80211_ATTR_GENERATION,
rdev->devlist_generation ^
- (cfg80211_rdev_list_generation << 2)))
+ (cfg80211_rdev_list_generation << 2)) ||
+ nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr))
goto nla_put_failure;
if (rdev->ops->get_channel) {
@@ -4471,6 +4528,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
case RATE_INFO_BW_160:
rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH;
break;
+ case RATE_INFO_BW_HE_RU:
+ rate_flg = 0;
+ WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS));
}
if (rate_flg && nla_put_flag(msg, rate_flg))
@@ -4490,6 +4550,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
return false;
+ } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) {
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm))
+ return false;
+ if (info->bw == RATE_INFO_BW_HE_RU &&
+ nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC,
+ info->he_ru_alloc))
+ return false;
}
nla_nest_end(msg, rate);
@@ -4546,13 +4619,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
#define PUT_SINFO(attr, memb, type) do { \
BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \
- if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \
nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \
sinfo->memb)) \
goto nla_put_failure; \
} while (0)
#define PUT_SINFO_U64(attr, memb) do { \
- if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \
nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \
sinfo->memb, NL80211_STA_INFO_PAD)) \
goto nla_put_failure; \
@@ -4561,14 +4634,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO(CONNECTED_TIME, connected_time, u32);
PUT_SINFO(INACTIVE_TIME, inactive_time, u32);
- if (sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES) |
- BIT(NL80211_STA_INFO_RX_BYTES64)) &&
+ if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) |
+ BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) &&
nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES,
(u32)sinfo->rx_bytes))
goto nla_put_failure;
- if (sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES) |
- BIT(NL80211_STA_INFO_TX_BYTES64)) &&
+ if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) |
+ BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) &&
nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES,
(u32)sinfo->tx_bytes))
goto nla_put_failure;
@@ -4588,24 +4661,24 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
default:
break;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) {
if (!nl80211_put_signal(msg, sinfo->chains,
sinfo->chain_signal,
NL80211_STA_INFO_CHAIN_SIGNAL))
goto nla_put_failure;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) {
if (!nl80211_put_signal(msg, sinfo->chains,
sinfo->chain_signal_avg,
NL80211_STA_INFO_CHAIN_SIGNAL_AVG))
goto nla_put_failure;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) {
if (!nl80211_put_sta_rate(msg, &sinfo->txrate,
NL80211_STA_INFO_TX_BITRATE))
goto nla_put_failure;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) {
if (!nl80211_put_sta_rate(msg, &sinfo->rxrate,
NL80211_STA_INFO_RX_BITRATE))
goto nla_put_failure;
@@ -4621,7 +4694,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO(PEER_PM, peer_pm, u32);
PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
- if (sinfo->filled & BIT(NL80211_STA_INFO_BSS_PARAM)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) {
bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM);
if (!bss_param)
goto nla_put_failure;
@@ -4640,7 +4713,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
nla_nest_end(msg, bss_param);
}
- if ((sinfo->filled & BIT(NL80211_STA_INFO_STA_FLAGS)) &&
+ if ((sinfo->filled & BIT_ULL(NL80211_STA_INFO_STA_FLAGS)) &&
nla_put(msg, NL80211_STA_INFO_STA_FLAGS,
sizeof(struct nl80211_sta_flag_update),
&sinfo->sta_flags))
@@ -4886,7 +4959,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
return -EINVAL;
if (params->supported_rates)
return -EINVAL;
- if (params->ext_capab || params->ht_capa || params->vht_capa)
+ if (params->ext_capab || params->ht_capa || params->vht_capa ||
+ params->he_capa)
return -EINVAL;
}
@@ -5092,6 +5166,15 @@ static int nl80211_set_station_tdls(struct genl_info *info,
if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
params->vht_capa =
nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+ if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+ params->he_capa =
+ nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+ params->he_capa_len =
+ nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+ if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+ return -EINVAL;
+ }
err = nl80211_parse_sta_channel_info(info, params);
if (err)
@@ -5319,6 +5402,17 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
params.vht_capa =
nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+ if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+ params.he_capa =
+ nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+ params.he_capa_len =
+ nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+ /* max len is validated in nla policy */
+ if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+ return -EINVAL;
+ }
+
if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
params.opmode_notif_used = true;
params.opmode_notif =
@@ -5351,6 +5445,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
params.ht_capa = NULL;
params.vht_capa = NULL;
+
+ /* HE requires WME */
+ if (params.he_capa_len)
+ return -EINVAL;
}
/* When you run into this, adjust the code below for the new flag */
@@ -6848,6 +6946,16 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev)
return regulatory_pre_cac_allowed(wdev->wiphy);
}
+/*
+ * Check one scan flag against the extended feature it depends on.
+ *
+ * Returns true when @flag is not set in @flags, or when it is set and
+ * @wiphy has the extended feature @feat; false otherwise.
+ */
+static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag,
+				    enum nl80211_ext_feature_index feat)
+{
+	/* a flag that was not requested needs no feature support */
+	return !(flags & flag) || wiphy_ext_feature_isset(wiphy, feat);
+}
+
static int
nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
void *request, struct nlattr **attrs,
@@ -6882,15 +6990,33 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
!(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) ||
- ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) ||
- ((*flags & NL80211_SCAN_FLAG_LOW_POWER) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_LOW_POWER_SCAN)) ||
- ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN)))
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_LOW_SPAN,
+ NL80211_EXT_FEATURE_LOW_SPAN_SCAN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_LOW_POWER,
+ NL80211_EXT_FEATURE_LOW_POWER_SCAN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_HIGH_ACCURACY,
+ NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME,
+ NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP,
+ NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
+ NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE,
+ NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_RANDOM_SN,
+ NL80211_EXT_FEATURE_SCAN_RANDOM_SN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_MIN_PREQ_CONTENT,
+ NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT))
return -EOPNOTSUPP;
if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
@@ -6905,26 +7031,6 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
return err;
}
- if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME))
- return -EOPNOTSUPP;
-
- if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP))
- return -EOPNOTSUPP;
-
- if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION))
- return -EOPNOTSUPP;
-
- if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE))
- return -EOPNOTSUPP;
-
return 0;
}
@@ -10147,7 +10253,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
if (err)
return err;
- if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
wdev->cqm_config->last_rssi_event_value =
(s8) sinfo.rx_beacon_signal_avg;
}
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 570a2b67ca10..6ab32f6a1961 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -102,7 +102,7 @@ static int wiphy_suspend(struct device *dev)
struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
int ret = 0;
- rdev->suspend_at = get_seconds();
+ rdev->suspend_at = ktime_get_boottime_seconds();
rtnl_lock();
if (rdev->wiphy.registered) {
@@ -130,7 +130,7 @@ static int wiphy_resume(struct device *dev)
int ret = 0;
/* Age scan results with time spent in suspend */
- cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
+ cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at);
rtnl_lock();
if (rdev->wiphy.registered && rdev->ops->resume)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 3c654cd7ba56..e0825a019e9f 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -4,6 +4,7 @@
*
* Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2017 Intel Deutschland GmbH
*/
#include <linux/export.h>
#include <linux/bitops.h>
@@ -1142,6 +1143,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
return 0;
}
+/*
+ * Compute the bitrate of the HE rate described by @rate.
+ *
+ * Each base-rate table holds, per guard interval (indexed by rate->he_gi),
+ * the rate for the highest MCS (11) with eight spatial streams.  That value
+ * is scaled down by the per-MCS divisor (fixed point, SCALE == 1.0), then by
+ * nss / 8, and halved when DCM is in use.
+ *
+ * The table entries are in units of 10 bit/s, so the result is divided by
+ * 10000 to match the 100 kbit/s unit reported by
+ * cfg80211_calculate_bitrate().
+ *
+ * Returns 0, after a warning, if any field of @rate is out of range or the
+ * bandwidth / RU allocation combination is not known.
+ */
+static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
+{
+#define SCALE 2048
+	static const u16 mcs_divisors[12] = {
+		34133, /* 16.666666... */
+		17067, /*  8.333333... */
+		11378, /*  5.555555... */
+		 8533, /*  4.166666... */
+		 5689, /*  2.777777... */
+		 4267, /*  2.083333... */
+		 3923, /*  1.851851... */
+		 3413, /*  1.666666... */
+		 2844, /*  1.388888... */
+		 2560, /*  1.250000... */
+		 2276, /*  1.111111... */
+		 2048, /*  1.000000... */
+	};
+	static const u32 rates_160M[3] = { 960777777, 907400000, 816666666 };
+	static const u32 rates_969[3] =  { 480388888, 453700000, 408333333 };
+	static const u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
+	static const u32 rates_242[3] =  { 114711111, 108333333,  97500000 };
+	static const u32 rates_106[3] =  {  40000000,  37777777,  34000000 };
+	static const u32 rates_52[3]  =  {  18820000,  17777777,  16000000 };
+	static const u32 rates_26[3]  =  {   9411111,   8888888,   8000000 };
+	u64 tmp;
+	u32 result;
+
+	/* validate everything that is used as an index or multiplier below */
+	if (WARN_ON_ONCE(rate->mcs > 11))
+		return 0;
+
+	if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2))
+		return 0;
+	if (WARN_ON_ONCE(rate->he_ru_alloc >
+			 NL80211_RATE_INFO_HE_RU_ALLOC_2x996))
+		return 0;
+	if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
+		return 0;
+
+	/* a full channel width uses the same base rate as its largest RU */
+	if (rate->bw == RATE_INFO_BW_160)
+		result = rates_160M[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_80 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996))
+		result = rates_969[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_40 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484))
+		result = rates_484[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_20 ||
+		 (rate->bw == RATE_INFO_BW_HE_RU &&
+		  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242))
+		result = rates_242[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106)
+		result = rates_106[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52)
+		result = rates_52[rate->he_gi];
+	else if (rate->bw == RATE_INFO_BW_HE_RU &&
+		 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26)
+		result = rates_26[rate->he_gi];
+	else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
+		      rate->bw, rate->he_ru_alloc))
+		return 0;
+
+	/* now scale to the appropriate MCS */
+	tmp = result;
+	tmp *= SCALE;
+	do_div(tmp, mcs_divisors[rate->mcs]);
+
+	/*
+	 * Take NSS and DCM into account.  Stay in 64 bits here: the scaled
+	 * rate multiplied by up to eight streams does not fit in a u32
+	 * (960777777 * 5 already exceeds 2^32).
+	 */
+	tmp *= rate->nss;
+	do_div(tmp, 8);
+	if (rate->he_dcm)
+		do_div(tmp, 2);
+
+	/* convert from 10 bit/s to 100 kbit/s units */
+	do_div(tmp, 10000);
+	result = tmp;
+
+	return result;
+}
+
u32 cfg80211_calculate_bitrate(struct rate_info *rate)
{
if (rate->flags & RATE_INFO_FLAGS_MCS)
@@ -1150,6 +1230,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
return cfg80211_calculate_bitrate_60g(rate);
if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
return cfg80211_calculate_bitrate_vht(rate);
+ if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
+ return cfg80211_calculate_bitrate_he(rate);
return rate->legacy;
}
@@ -1791,8 +1873,9 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
{
- sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)),
- IEEE80211_NUM_TIDS + 1, gfp);
+ sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1,
+ sizeof(*(sinfo->pertid)),
+ gfp);
if (!sinfo->pertid)
return -ENOMEM;
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 05186a47878f..167f7025ac98 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1278,7 +1278,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
if (err)
return err;
- if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)))
+ if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)))
return -EOPNOTSUPP;
rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
@@ -1320,7 +1320,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
switch (rdev->wiphy.signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
- if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) {
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) {
int sig = sinfo.signal;
wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
@@ -1334,7 +1334,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
break;
}
case CFG80211_SIGNAL_TYPE_UNSPEC:
- if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) {
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) {
wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
wstats.qual.level = sinfo.signal;
@@ -1347,9 +1347,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
}
wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
- if (sinfo.filled & BIT(NL80211_STA_INFO_RX_DROP_MISC))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC))
wstats.discard.misc = sinfo.rx_dropped_misc;
- if (sinfo.filled & BIT(NL80211_STA_INFO_TX_FAILED))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))
wstats.discard.retries = sinfo.tx_failed;
return &wstats;