summary refs log tree commit diff stats
path: root/include/net
diff options
context:
space:
mode:
Diffstat (limited to 'include/net')
-rw-r--r-- include/net/bluetooth/hci.h 3
-rw-r--r-- include/net/bluetooth/hci_core.h 40
-rw-r--r-- include/net/bluetooth/hci_mon.h 2
-rw-r--r-- include/net/bluetooth/hci_sync.h 2
-rw-r--r-- include/net/devlink.h 60
-rw-r--r-- include/net/dsa.h 56
-rw-r--r-- include/net/dsa_stubs.h 22
-rw-r--r-- include/net/dst.h 7
-rw-r--r-- include/net/inet_connection_sock.h 1
-rw-r--r-- include/net/inet_sock.h 1
-rw-r--r-- include/net/inet_timewait_sock.h 3
-rw-r--r-- include/net/ip.h 1
-rw-r--r-- include/net/ipv6.h 6
-rw-r--r-- include/net/net_namespace.h 15
-rw-r--r-- include/net/netfilter/nf_conntrack_labels.h 2
-rw-r--r-- include/net/netfilter/nf_flow_table.h 1
-rw-r--r-- include/net/netfilter/nf_tables.h 62
-rw-r--r-- include/net/netkit.h 38
-rw-r--r-- include/net/netlink.h 73
-rw-r--r-- include/net/netns/conntrack.h 2
-rw-r--r-- include/net/netns/xfrm.h 1
-rw-r--r-- include/net/page_pool/helpers.h 210
-rw-r--r-- include/net/page_pool/types.h 6
-rw-r--r-- include/net/sock.h 10
-rw-r--r-- include/net/tc_act/tc_ct.h 1
-rw-r--r-- include/net/tcp.h 74
-rw-r--r-- include/net/tcx.h 7
-rw-r--r-- include/net/udp_tunnel.h 8
-rw-r--r-- include/net/xdp_sock.h 16
29 files changed, 537 insertions, 193 deletions
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 87d92accc26e..bdee5d649cc6 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -1,6 +1,7 @@
/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (C) 2000-2001 Qualcomm Incorporated
+ Copyright 2023 NXP
Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
@@ -673,6 +674,8 @@ enum {
#define HCI_TX_POWER_INVALID 127
#define HCI_RSSI_INVALID 127
+#define HCI_SYNC_HANDLE_INVALID 0xffff
+
#define HCI_ROLE_MASTER 0x00
#define HCI_ROLE_SLAVE 0x01
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index c33348ba1657..20988623c5cc 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -350,6 +350,8 @@ struct hci_dev {
struct list_head list;
struct mutex lock;
+ struct ida unset_handle_ida;
+
const char *name;
unsigned long flags;
__u16 id;
@@ -1290,8 +1292,8 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev,
return NULL;
}
-static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev *hdev,
- __u8 handle)
+static inline struct hci_conn *
+hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big)
{
struct hci_conn_hash *h = &hdev->conn_hash;
struct hci_conn *c;
@@ -1299,22 +1301,22 @@ static inline struct hci_conn *hci_conn_hash_lookup_big_any_dst(struct hci_dev *
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK)
+ if (c->type != ISO_LINK ||
+ !test_bit(HCI_CONN_PA_SYNC, &c->flags))
continue;
- if (handle != BT_ISO_QOS_BIG_UNSET && handle == c->iso_qos.bcast.big) {
+ if (c->iso_qos.bcast.big == big) {
rcu_read_unlock();
return c;
}
}
-
rcu_read_unlock();
return NULL;
}
static inline struct hci_conn *
-hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big)
+hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
{
struct hci_conn_hash *h = &hdev->conn_hash;
struct hci_conn *c;
@@ -1326,7 +1328,7 @@ hci_conn_hash_lookup_pa_sync(struct hci_dev *hdev, __u8 big)
!test_bit(HCI_CONN_PA_SYNC, &c->flags))
continue;
- if (c->iso_qos.bcast.big == big) {
+ if (c->sync_handle == sync_handle) {
rcu_read_unlock();
return c;
}
@@ -1377,6 +1379,26 @@ static inline void hci_conn_hash_list_state(struct hci_dev *hdev,
rcu_read_unlock();
}
+static inline void hci_conn_hash_list_flag(struct hci_dev *hdev,
+ hci_conn_func_t func, __u8 type,
+ __u8 flag, void *data)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *c;
+
+ if (!func)
+ return;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(c, &h->list, list) {
+ if (c->type == type && test_bit(flag, &c->flags))
+ func(c, data);
+ }
+
+ rcu_read_unlock();
+}
+
static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev)
{
struct hci_conn_hash *h = &hdev->conn_hash;
@@ -1426,7 +1448,9 @@ int hci_le_create_cis_pending(struct hci_dev *hdev);
int hci_conn_check_create_cis(struct hci_conn *conn);
struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
- u8 role);
+ u8 role, u16 handle);
+struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
+ bdaddr_t *dst, u8 role);
void hci_conn_del(struct hci_conn *conn);
void hci_conn_hash_flush(struct hci_dev *hdev);
void hci_conn_check_pending(struct hci_dev *hdev);
diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h
index 2d5fcda1bcd0..082f89531b88 100644
--- a/include/net/bluetooth/hci_mon.h
+++ b/include/net/bluetooth/hci_mon.h
@@ -56,7 +56,7 @@ struct hci_mon_new_index {
__u8 type;
__u8 bus;
bdaddr_t bdaddr;
- char name[8];
+ char name[8] __nonstring;
} __packed;
#define HCI_MON_NEW_INDEX_SIZE 16
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 57eeb07aeb25..6efbc2152146 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -80,6 +80,8 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len,
u8 *data, u32 flags, u16 min_interval,
u16 max_interval, u16 sync_interval);
+int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance);
+
int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk,
u8 instance, bool force);
int hci_disable_advertising_sync(struct hci_dev *hdev);
diff --git a/include/net/devlink.h b/include/net/devlink.h
index fad8e36e3d98..9ac394bdfbe4 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1854,36 +1854,36 @@ int devlink_info_version_running_put_ext(struct devlink_info_req *req,
const char *version_value,
enum devlink_info_version_type version_type);
-int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg);
-int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg);
-
-int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name);
-int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg);
-
-int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
- const char *name);
-int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg);
-int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
- const char *name);
-int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg);
-
-int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value);
-int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value);
-int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
- u16 value_len);
-
-int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
- bool value);
-int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
- u8 value);
-int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
- u32 value);
-int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
- u64 value);
-int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
- const char *value);
-int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
- const void *value, u32 value_len);
+void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg);
+void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg);
+
+void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name);
+void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg);
+
+void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name);
+void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg);
+void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name);
+void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg);
+
+void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value);
+void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value);
+void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+ u16 value_len);
+
+void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ bool value);
+void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u8 value);
+void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u32 value);
+void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u64 value);
+void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ const char *value);
+void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ const void *value, u32 value_len);
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index d98439ea6146..82135fbdb1e6 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -102,11 +102,11 @@ struct dsa_device_ops {
const char *name;
enum dsa_tag_protocol proto;
/* Some tagging protocols either mangle or shift the destination MAC
- * address, in which case the DSA master would drop packets on ingress
+ * address, in which case the DSA conduit would drop packets on ingress
* if what it understands out of the destination MAC address is not in
* its RX filter.
*/
- bool promisc_on_master;
+ bool promisc_on_conduit;
};
struct dsa_lag {
@@ -236,12 +236,12 @@ struct dsa_bridge {
};
struct dsa_port {
- /* A CPU port is physically connected to a master device.
- * A user port exposed to userspace has a slave device.
+ /* A CPU port is physically connected to a conduit device. A user port
+ * exposes a network device to user-space, called 'user' here.
*/
union {
- struct net_device *master;
- struct net_device *slave;
+ struct net_device *conduit;
+ struct net_device *user;
};
/* Copy of the tagging protocol operations, for quicker access
@@ -249,7 +249,7 @@ struct dsa_port {
*/
const struct dsa_device_ops *tag_ops;
- /* Copies for faster access in master receive hot path */
+ /* Copies for faster access in conduit receive hot path */
struct dsa_switch_tree *dst;
struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
@@ -281,9 +281,9 @@ struct dsa_port {
u8 lag_tx_enabled:1;
- /* Master state bits, valid only on CPU ports */
- u8 master_admin_up:1;
- u8 master_oper_up:1;
+ /* conduit state bits, valid only on CPU ports */
+ u8 conduit_admin_up:1;
+ u8 conduit_oper_up:1;
/* Valid only on user ports */
u8 cpu_port_in_lag:1;
@@ -303,7 +303,7 @@ struct dsa_port {
struct list_head list;
/*
- * Original copy of the master netdev ethtool_ops
+ * Original copy of the conduit netdev ethtool_ops
*/
const struct ethtool_ops *orig_ethtool_ops;
@@ -452,10 +452,10 @@ struct dsa_switch {
const struct dsa_switch_ops *ops;
/*
- * Slave mii_bus and devices for the individual ports.
+ * User mii_bus and devices for the individual ports.
*/
u32 phys_mii_mask;
- struct mii_bus *slave_mii_bus;
+ struct mii_bus *user_mii_bus;
/* Ageing Time limits in msecs */
unsigned int ageing_time_min;
@@ -520,10 +520,10 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp)
return dp->type == DSA_PORT_TYPE_UNUSED;
}
-static inline bool dsa_port_master_is_operational(struct dsa_port *dp)
+static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp)
{
- return dsa_port_is_cpu(dp) && dp->master_admin_up &&
- dp->master_oper_up;
+ return dsa_port_is_cpu(dp) && dp->conduit_admin_up &&
+ dp->conduit_oper_up;
}
static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
@@ -713,12 +713,12 @@ static inline bool dsa_port_offloads_lag(struct dsa_port *dp,
return dsa_port_lag_dev_get(dp) == lag->dev;
}
-static inline struct net_device *dsa_port_to_master(const struct dsa_port *dp)
+static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp)
{
if (dp->cpu_port_in_lag)
return dsa_port_lag_dev_get(dp->cpu_dp);
- return dp->cpu_dp->master;
+ return dp->cpu_dp->conduit;
}
static inline
@@ -732,7 +732,7 @@ struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp)
else if (dp->hsr_dev)
return dp->hsr_dev;
- return dp->slave;
+ return dp->user;
}
static inline struct net_device *
@@ -834,9 +834,9 @@ struct dsa_switch_ops {
int (*connect_tag_protocol)(struct dsa_switch *ds,
enum dsa_tag_protocol proto);
- int (*port_change_master)(struct dsa_switch *ds, int port,
- struct net_device *master,
- struct netlink_ext_ack *extack);
+ int (*port_change_conduit)(struct dsa_switch *ds, int port,
+ struct net_device *conduit,
+ struct netlink_ext_ack *extack);
/* Optional switch-wide initialization and destruction methods */
int (*setup)(struct dsa_switch *ds);
@@ -1233,11 +1233,11 @@ struct dsa_switch_ops {
int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);
/*
- * DSA master tracking operations
+ * DSA conduit tracking operations
*/
- void (*master_state_change)(struct dsa_switch *ds,
- const struct net_device *master,
- bool operational);
+ void (*conduit_state_change)(struct dsa_switch *ds,
+ const struct net_device *conduit,
+ bool operational);
};
#define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \
@@ -1374,9 +1374,9 @@ static inline int dsa_switch_resume(struct dsa_switch *ds)
#endif /* CONFIG_PM_SLEEP */
#if IS_ENABLED(CONFIG_NET_DSA)
-bool dsa_slave_dev_check(const struct net_device *dev);
+bool dsa_user_dev_check(const struct net_device *dev);
#else
-static inline bool dsa_slave_dev_check(const struct net_device *dev)
+static inline bool dsa_user_dev_check(const struct net_device *dev)
{
return false;
}
diff --git a/include/net/dsa_stubs.h b/include/net/dsa_stubs.h
index 361811750a54..6f384897f287 100644
--- a/include/net/dsa_stubs.h
+++ b/include/net/dsa_stubs.h
@@ -13,14 +13,14 @@
extern const struct dsa_stubs *dsa_stubs;
struct dsa_stubs {
- int (*master_hwtstamp_validate)(struct net_device *dev,
- const struct kernel_hwtstamp_config *config,
- struct netlink_ext_ack *extack);
+ int (*conduit_hwtstamp_validate)(struct net_device *dev,
+ const struct kernel_hwtstamp_config *config,
+ struct netlink_ext_ack *extack);
};
-static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
- const struct kernel_hwtstamp_config *config,
- struct netlink_ext_ack *extack)
+static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev,
+ const struct kernel_hwtstamp_config *config,
+ struct netlink_ext_ack *extack)
{
if (!netdev_uses_dsa(dev))
return 0;
@@ -29,18 +29,18 @@ static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
* netdev_uses_dsa() returns true, the dsa_core module is still
* registered, and so, dsa_unregister_stubs() couldn't have run.
* For netdev_uses_dsa() to start returning false, it would imply that
- * dsa_master_teardown() has executed, which requires rtnl_lock().
+ * dsa_conduit_teardown() has executed, which requires rtnl_lock().
*/
ASSERT_RTNL();
- return dsa_stubs->master_hwtstamp_validate(dev, config, extack);
+ return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack);
}
#else
-static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
- const struct kernel_hwtstamp_config *config,
- struct netlink_ext_ack *extack)
+static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev,
+ const struct kernel_hwtstamp_config *config,
+ struct netlink_ext_ack *extack)
{
return 0;
}
diff --git a/include/net/dst.h b/include/net/dst.h
index f8b8599a0600..f5dfc8fb7b37 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -222,13 +222,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr
return msecs_to_jiffies(dst_metric(dst, metric));
}
-static inline u32
-dst_allfrag(const struct dst_entry *dst)
-{
- int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG);
- return ret;
-}
-
static inline int
dst_metric_locked(const struct dst_entry *dst, int metric)
{
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 086d1193c9ef..d0a2f827d5f2 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -44,7 +44,6 @@ struct inet_connection_sock_af_ops {
struct request_sock *req_unhash,
bool *own_req);
u16 net_header_len;
- u16 net_frag_header_len;
u16 sockaddr_len;
int (*setsockopt)(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen);
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 98e11958cdff..74db6d97cae1 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -244,7 +244,6 @@ struct inet_sock {
};
#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
-#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */
enum {
INET_FLAGS_PKTINFO = 0,
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 4a8e578405cb..b14999ff55db 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
/* And these are ours. */
unsigned int tw_transparent : 1,
tw_flowlabel : 20,
- tw_pad : 3, /* 3 bits hole */
+ tw_usec_ts : 1,
+ tw_pad : 2, /* 2 bits hole */
tw_tos : 8;
u32 tw_txhash;
u32 tw_priority;
diff --git a/include/net/ip.h b/include/net/ip.h
index 6fbc0dcf4b97..1fc4c8d69e33 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -810,5 +810,6 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val);
void ip_sock_set_pktinfo(struct sock *sk);
void ip_sock_set_recverr(struct sock *sk);
void ip_sock_set_tos(struct sock *sk, int val);
+void __ip_sock_set_tos(struct sock *sk, int val);
#endif /* _IP_H */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index b3444c8a6f74..78d38dd88aba 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1133,12 +1133,6 @@ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, st
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst,
bool connected);
-struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
- struct net_device *dev,
- struct net *net, struct socket *sock,
- struct in6_addr *saddr,
- const struct ip_tunnel_info *info,
- u8 protocol, bool use_cache);
struct dst_entry *ip6_blackhole_route(struct net *net,
struct dst_entry *orig_dst);
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index eb6cd43b1746..13b3a4e29fdb 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -368,21 +368,30 @@ static inline void put_net_track(struct net *net, netns_tracker *tracker)
typedef struct {
#ifdef CONFIG_NET_NS
- struct net *net;
+ struct net __rcu *net;
#endif
} possible_net_t;
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
- pnet->net = net;
+ rcu_assign_pointer(pnet->net, net);
#endif
}
static inline struct net *read_pnet(const possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
- return pnet->net;
+ return rcu_dereference_protected(pnet->net, true);
+#else
+ return &init_net;
+#endif
+}
+
+static inline struct net *read_pnet_rcu(possible_net_t *pnet)
+{
+#ifdef CONFIG_NET_NS
+ return rcu_dereference(pnet->net);
#else
return &init_net;
#endif
diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index fcb19a4e8f2b..6903f72bcc15 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -39,7 +39,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
#ifdef CONFIG_NF_CONNTRACK_LABELS
struct net *net = nf_ct_net(ct);
- if (net->ct.labels_used == 0)
+ if (atomic_read(&net->ct.labels_used) == 0)
return NULL;
return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC);
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index d466e1a3b0b1..fe1507c1db82 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -53,6 +53,7 @@ struct nf_flowtable_type {
struct list_head list;
int family;
int (*init)(struct nf_flowtable *ft);
+ bool (*gc)(const struct flow_offload *flow);
int (*setup)(struct nf_flowtable *ft,
struct net_device *dev,
enum flow_block_command cmd);
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 9fb16485d08f..3bbd13ab1ecf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -274,6 +274,9 @@ struct nft_userdata {
unsigned char data[];
};
+/* placeholder structure for opaque set element backend representation. */
+struct nft_elem_priv { };
+
/**
* struct nft_set_elem - generic representation of set elements
*
@@ -294,9 +297,14 @@ struct nft_set_elem {
u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
struct nft_data val;
} data;
- void *priv;
+ struct nft_elem_priv *priv;
};
+static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
+{
+ return (void *)priv;
+}
+
struct nft_set;
struct nft_set_iter {
u8 genmask;
@@ -306,7 +314,7 @@ struct nft_set_iter {
int (*fn)(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
- struct nft_set_elem *elem);
+ struct nft_elem_priv *elem_priv);
};
/**
@@ -430,7 +438,8 @@ struct nft_set_ops {
const struct nft_set_ext **ext);
bool (*update)(struct nft_set *set,
const u32 *key,
- void *(*new)(struct nft_set *,
+ struct nft_elem_priv *
+ (*new)(struct nft_set *,
const struct nft_expr *,
struct nft_regs *),
const struct nft_expr *expr,
@@ -442,27 +451,27 @@ struct nft_set_ops {
int (*insert)(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext);
+ struct nft_elem_priv **priv);
void (*activate)(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem);
- void * (*deactivate)(const struct net *net,
+ struct nft_elem_priv *elem_priv);
+ struct nft_elem_priv * (*deactivate)(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem);
- bool (*flush)(const struct net *net,
+ void (*flush)(const struct net *net,
const struct nft_set *set,
- void *priv);
+ struct nft_elem_priv *priv);
void (*remove)(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem);
+ struct nft_elem_priv *elem_priv);
void (*walk)(const struct nft_ctx *ctx,
struct nft_set *set,
struct nft_set_iter *iter);
- void * (*get)(const struct net *net,
+ struct nft_elem_priv * (*get)(const struct net *net,
const struct nft_set *set,
const struct nft_set_elem *elem,
unsigned int flags);
- void (*commit)(const struct nft_set *set);
+ void (*commit)(struct nft_set *set);
void (*abort)(const struct nft_set *set);
u64 (*privsize)(const struct nlattr * const nla[],
const struct nft_set_desc *desc);
@@ -796,9 +805,9 @@ static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
}
static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
- void *elem)
+ const struct nft_elem_priv *elem_priv)
{
- return elem + set->ops->elemsize;
+ return (void *)elem_priv + set->ops->elemsize;
}
static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
@@ -810,16 +819,19 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
const struct nft_set *set,
const struct nlattr *attr);
-void *nft_set_elem_init(const struct nft_set *set,
- const struct nft_set_ext_tmpl *tmpl,
- const u32 *key, const u32 *key_end, const u32 *data,
- u64 timeout, u64 expiration, gfp_t gfp);
+struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
+ const struct nft_set_ext_tmpl *tmpl,
+ const u32 *key, const u32 *key_end,
+ const u32 *data,
+ u64 timeout, u64 expiration, gfp_t gfp);
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_expr *expr_array[]);
-void nft_set_elem_destroy(const struct nft_set *set, void *elem,
+void nft_set_elem_destroy(const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv,
bool destroy_expr);
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
- const struct nft_set *set, void *elem);
+ const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv);
struct nft_expr_ops;
/**
@@ -1061,7 +1073,7 @@ struct nft_chain {
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
const struct nft_set_iter *iter,
- struct nft_set_elem *elem);
+ struct nft_elem_priv *elem_priv);
int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
@@ -1638,14 +1650,14 @@ struct nft_trans_table {
struct nft_trans_elem {
struct nft_set *set;
- struct nft_set_elem elem;
+ struct nft_elem_priv *elem_priv;
bool bound;
};
#define nft_trans_elem_set(trans) \
(((struct nft_trans_elem *)trans->data)->set)
-#define nft_trans_elem(trans) \
- (((struct nft_trans_elem *)trans->data)->elem)
+#define nft_trans_elem_priv(trans) \
+ (((struct nft_trans_elem *)trans->data)->elem_priv)
#define nft_trans_elem_set_bound(trans) \
(((struct nft_trans_elem *)trans->data)->bound)
@@ -1686,7 +1698,7 @@ struct nft_trans_gc {
struct nft_set *set;
u32 seq;
u16 count;
- void *priv[NFT_TRANS_GC_BATCHCOUNT];
+ struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT];
struct rcu_head rcu;
};
@@ -1709,7 +1721,7 @@ struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc);
void nft_setelem_data_deactivate(const struct net *net,
const struct nft_set *set,
- struct nft_set_elem *elem);
+ struct nft_elem_priv *elem_priv);
int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);
diff --git a/include/net/netkit.h b/include/net/netkit.h
new file mode 100644
index 000000000000..0ba2e6b847ca
--- /dev/null
+++ b/include/net/netkit.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2023 Isovalent */
+#ifndef __NET_NETKIT_H
+#define __NET_NETKIT_H
+
+#include <linux/bpf.h>
+
+#ifdef CONFIG_NETKIT
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+#else
+static inline int netkit_prog_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
+
+static inline int netkit_link_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
+
+static inline int netkit_prog_detach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ return -EINVAL;
+}
+
+static inline int netkit_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_NETKIT */
+#endif /* __NET_NETKIT_H */
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 8a7cd1170e1f..83bdf787aeee 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -128,6 +128,8 @@
* nla_len(nla) length of attribute payload
*
* Attribute Payload Access for Basic Types:
+ * nla_get_uint(nla) get payload for a uint attribute
+ * nla_get_sint(nla) get payload for a sint attribute
* nla_get_u8(nla) get payload for a u8 attribute
* nla_get_u16(nla) get payload for a u16 attribute
* nla_get_u32(nla) get payload for a u32 attribute
@@ -183,6 +185,8 @@ enum {
NLA_REJECT,
NLA_BE16,
NLA_BE32,
+ NLA_SINT,
+ NLA_UINT,
__NLA_TYPE_MAX,
};
@@ -229,6 +233,7 @@ enum nla_policy_validation {
* nested header (or empty); len field is used if
* nested_policy is also used, for the max attr
* number in the nested policy.
+ * NLA_SINT, NLA_UINT,
* NLA_U8, NLA_U16,
* NLA_U32, NLA_U64,
* NLA_S8, NLA_S16,
@@ -260,12 +265,14 @@ enum nla_policy_validation {
* while an array has the nested attributes at another
* level down and the attribute types directly in the
* nesting don't matter.
+ * NLA_UINT,
* NLA_U8,
* NLA_U16,
* NLA_U32,
* NLA_U64,
* NLA_BE16,
* NLA_BE32,
+ * NLA_SINT,
* NLA_S8,
* NLA_S16,
* NLA_S32,
@@ -280,6 +287,7 @@ enum nla_policy_validation {
* or NLA_POLICY_FULL_RANGE_SIGNED() macros instead.
* Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
* NLA_POLICY_RANGE() macros.
+ * NLA_UINT,
* NLA_U8,
* NLA_U16,
* NLA_U32,
@@ -288,6 +296,7 @@ enum nla_policy_validation {
* to a struct netlink_range_validation that indicates
* the min/max values.
* Use NLA_POLICY_FULL_RANGE().
+ * NLA_SINT,
* NLA_S8,
* NLA_S16,
* NLA_S32,
@@ -351,8 +360,8 @@ struct nla_policy {
const u32 mask;
const char *reject_message;
const struct nla_policy *nested_policy;
- struct netlink_range_validation *range;
- struct netlink_range_validation_signed *range_signed;
+ const struct netlink_range_validation *range;
+ const struct netlink_range_validation_signed *range_signed;
struct {
s16 min, max;
};
@@ -377,9 +386,11 @@ struct nla_policy {
#define __NLA_IS_UINT_TYPE(tp) \
(tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \
- tp == NLA_U64 || tp == NLA_BE16 || tp == NLA_BE32)
+ tp == NLA_U64 || tp == NLA_UINT || \
+ tp == NLA_BE16 || tp == NLA_BE32)
#define __NLA_IS_SINT_TYPE(tp) \
- (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64)
+ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \
+ tp == NLA_SINT)
#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
#define NLA_ENSURE_UINT_TYPE(tp) \
@@ -1358,6 +1369,22 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
}
/**
+ * nla_put_uint - Add a variable-size unsigned int to a socket buffer
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @value: numeric value
+ */
+static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value)
+{
+ u64 tmp64 = value;
+ u32 tmp32 = value;
+
+ if (tmp64 == tmp32)
+ return nla_put_u32(skb, attrtype, tmp32);
+ return nla_put(skb, attrtype, sizeof(u64), &tmp64);
+}
+
+/**
* nla_put_be32 - Add a __be32 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
@@ -1512,6 +1539,22 @@ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
}
/**
+ * nla_put_sint - Add a variable-size signed int to a socket buffer
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @value: numeric value
+ */
+static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value)
+{
+ s64 tmp64 = value;
+ s32 tmp32 = value;
+
+ if (tmp64 == tmp32)
+ return nla_put_s32(skb, attrtype, tmp32);
+ return nla_put(skb, attrtype, sizeof(s64), &tmp64);
+}
+
+/**
* nla_put_string - Add a string netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
@@ -1668,6 +1711,17 @@ static inline u64 nla_get_u64(const struct nlattr *nla)
}
/**
+ * nla_get_uint - return payload of uint attribute
+ * @nla: uint netlink attribute
+ */
+static inline u64 nla_get_uint(const struct nlattr *nla)
+{
+ if (nla_len(nla) == sizeof(u32))
+ return nla_get_u32(nla);
+ return nla_get_u64(nla);
+}
+
+/**
* nla_get_be64 - return payload of __be64 attribute
* @nla: __be64 netlink attribute
*/
@@ -1730,6 +1784,17 @@ static inline s64 nla_get_s64(const struct nlattr *nla)
}
/**
+ * nla_get_sint - return payload of sint attribute
+ * @nla: sint netlink attribute
+ */
+static inline s64 nla_get_sint(const struct nlattr *nla)
+{
+ if (nla_len(nla) == sizeof(s32))
+ return nla_get_s32(nla);
+ return nla_get_s64(nla);
+}
+
+/**
* nla_get_flag - return payload of flag attribute
* @nla: flag netlink attribute
*/
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 1f463b3957c7..bae914815aa3 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -107,7 +107,7 @@ struct netns_ct {
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
struct nf_ip_net nf_ct_proto;
#if defined(CONFIG_NF_CONNTRACK_LABELS)
- unsigned int labels_used;
+ atomic_t labels_used;
#endif
};
#endif
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index bd7c3be4af5d..423b52eca908 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -50,6 +50,7 @@ struct netns_xfrm {
struct list_head policy_all;
struct hlist_head *policy_byidx;
unsigned int policy_idx_hmask;
+ unsigned int idx_generator;
struct hlist_head policy_inexact[XFRM_POLICY_MAX];
struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX];
unsigned int policy_count[XFRM_POLICY_MAX * 2];
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 8f64adf86f5b..4ebd544ae977 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -8,23 +8,46 @@
/**
* DOC: page_pool allocator
*
- * The page_pool allocator is optimized for the XDP mode that
- * uses one frame per-page, but it can fallback on the
- * regular page allocator APIs.
- *
- * Basic use involves replacing alloc_pages() calls with the
- * page_pool_alloc_pages() call. Drivers should use
- * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
- *
- * The API keeps track of in-flight pages, in order to let API users know
- * when it is safe to free a page_pool object. Thus, API users
- * must call page_pool_put_page() to free the page, or attach
- * the page to a page_pool-aware object like skbs marked with
+ * The page_pool allocator is optimized for recycling page or page fragment used
+ * by skb packet and xdp frame.
+ *
+ * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(),
+ * which allocates memory with or without page splitting depending on the
+ * requested memory size.
+ *
+ * If the driver knows that it always requires full pages or its allocations are
+ * always smaller than half a page, it can use one of the more specific API
+ * calls:
+ *
+ * 1. page_pool_alloc_pages(): allocate memory without page splitting when
+ * driver knows that the memory it needs is always bigger than half of the page
+ * allocated from page pool. There is no cache line dirtying for 'struct page'
+ * when a page is recycled back to the page pool.
+ *
+ * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver
+ * knows that the memory it needs is always smaller than or equal to half of the
+ * page allocated from page pool. Page splitting enables memory saving and thus
+ * avoids TLB/cache miss for data access, but there also is some cost to
+ * implement page splitting, mainly some cache line dirtying/bouncing for
+ * 'struct page' and atomic operation for page->pp_frag_count.
+ *
+ * The API keeps track of in-flight pages, in order to let API users know when
+ * it is safe to free a page_pool object. The API users must call
+ * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or
+ * attach the page_pool object to a page_pool-aware object like skbs marked with
* skb_mark_for_recycle().
*
- * API users must call page_pool_put_page() once on a page, as it
- * will either recycle the page, or in case of refcnt > 1, it will
- * release the DMA mapping and in-flight state accounting.
+ * page_pool_put_page() may be called multiple times on the same page if a page
+ * is split into multiple fragments. For the last fragment, it will either
+ * recycle the page, or in case of page->_refcount > 1, it will release the DMA
+ * mapping and in-flight state accounting.
+ *
+ * dma_sync_single_range_for_device() is only called for the last fragment when
+ * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the
+ * last freed fragment to do the sync_for_device operation for all fragments in
+ * the same page when a page is split, the API user must setup pool->p.max_len
+ * and pool->p.offset correctly and ensure that page_pool_put_page() is called
+ * with dma_sync_size being -1 for fragment API.
*/
#ifndef _NET_PAGE_POOL_HELPERS_H
#define _NET_PAGE_POOL_HELPERS_H
@@ -73,6 +96,17 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
return page_pool_alloc_pages(pool, gfp);
}
+/**
+ * page_pool_dev_alloc_frag() - allocate a page fragment.
+ * @pool: pool from which to allocate
+ * @offset: offset to the allocated page
+ * @size: requested size
+ *
+ * Get a page fragment from the page allocator or page_pool caches.
+ *
+ * Return:
+ * Return allocated page fragment, otherwise return NULL.
+ */
static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
unsigned int *offset,
unsigned int size)
@@ -82,6 +116,91 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
return page_pool_alloc_frag(pool, offset, size, gfp);
}
+static inline struct page *page_pool_alloc(struct page_pool *pool,
+ unsigned int *offset,
+ unsigned int *size, gfp_t gfp)
+{
+ unsigned int max_size = PAGE_SIZE << pool->p.order;
+ struct page *page;
+
+ if ((*size << 1) > max_size) {
+ *size = max_size;
+ *offset = 0;
+ return page_pool_alloc_pages(pool, gfp);
+ }
+
+ page = page_pool_alloc_frag(pool, offset, *size, gfp);
+ if (unlikely(!page))
+ return NULL;
+
+ /* There is very likely not enough space for another fragment, so append
+ * the remaining size to the current fragment to avoid truesize
+ * underestimate problem.
+ */
+ if (pool->frag_offset + *size > max_size) {
+ *size = max_size - *offset;
+ pool->frag_offset = max_size;
+ }
+
+ return page;
+}
+
+/**
+ * page_pool_dev_alloc() - allocate a page or a page fragment.
+ * @pool: pool from which to allocate
+ * @offset: offset to the allocated page
+ * @size: in as the requested size, out as the allocated size
+ *
+ * Get a page or a page fragment from the page allocator or page_pool caches
+ * depending on the requested size in order to allocate memory with least memory
+ * utilization and performance penalty.
+ *
+ * Return:
+ * Return allocated page or page fragment, otherwise return NULL.
+ */
+static inline struct page *page_pool_dev_alloc(struct page_pool *pool,
+ unsigned int *offset,
+ unsigned int *size)
+{
+ gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+ return page_pool_alloc(pool, offset, size, gfp);
+}
+
+static inline void *page_pool_alloc_va(struct page_pool *pool,
+ unsigned int *size, gfp_t gfp)
+{
+ unsigned int offset;
+ struct page *page;
+
+ /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */
+ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM);
+ if (unlikely(!page))
+ return NULL;
+
+ return page_address(page) + offset;
+}
+
+/**
+ * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its
+ * va.
+ * @pool: pool from which to allocate
+ * @size: in as the requested size, out as the allocated size
+ *
+ * This is just a thin wrapper around the page_pool_alloc() API, and
+ * it returns va of the allocated page or page fragment.
+ *
+ * Return:
+ * Return the va for the allocated page or page fragment, otherwise return NULL.
+ */
+static inline void *page_pool_dev_alloc_va(struct page_pool *pool,
+ unsigned int *size)
+{
+ gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+ return page_pool_alloc_va(pool, size, gfp);
+}
+
/**
* page_pool_get_dma_dir() - Retrieve the stored DMA direction.
* @pool: pool from which page was allocated
@@ -115,28 +234,49 @@ static inline long page_pool_defrag_page(struct page *page, long nr)
long ret;
/* If nr == pp_frag_count then we have cleared all remaining
- * references to the page. No need to actually overwrite it, instead
- * we can leave this to be overwritten by the calling function.
+ * references to the page:
+ * 1. 'nr == 1': no need to actually overwrite it.
+ * 2. 'nr != 1': overwrite it with one, which is the rare case
+ * for pp_frag_count draining.
*
- * The main advantage to doing this is that an atomic_read is
- * generally a much cheaper operation than an atomic update,
- * especially when dealing with a page that may be partitioned
- * into only 2 or 3 pieces.
+ * The main advantage to doing this is that we not only avoid an atomic
+ * update, as an atomic_read is generally a much cheaper operation than
+ * an atomic update, especially when dealing with a page that may be
+ * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count
+ * handling by ensuring all pages have been partitioned into only 1 piece
+ * initially, and only overwrite it when the page is partitioned into
+ * more than one piece.
*/
- if (atomic_long_read(&page->pp_frag_count) == nr)
+ if (atomic_long_read(&page->pp_frag_count) == nr) {
+ /* As we have ensured nr is always one for constant case using
+ * the BUILD_BUG_ON(), only need to handle the non-constant case
+ * here for pp_frag_count draining, which is a rare case.
+ */
+ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1);
+ if (!__builtin_constant_p(nr))
+ atomic_long_set(&page->pp_frag_count, 1);
+
return 0;
+ }
ret = atomic_long_sub_return(nr, &page->pp_frag_count);
WARN_ON(ret < 0);
+
+ /* We are the last user here too, reset pp_frag_count back to 1 to
+ * ensure all pages have been partitioned into 1 piece initially,
+ * this should be the rare case when the last two fragment users call
+ * page_pool_defrag_page() currently.
+ */
+ if (unlikely(!ret))
+ atomic_long_set(&page->pp_frag_count, 1);
+
return ret;
}
-static inline bool page_pool_is_last_frag(struct page_pool *pool,
- struct page *page)
+static inline bool page_pool_is_last_frag(struct page *page)
{
- /* If fragments aren't enabled or count is 0 we were the last user */
- return !(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
- (page_pool_defrag_page(page, 1) == 0);
+ /* If page_pool_defrag_page() returns 0, we were the last user */
+ return page_pool_defrag_page(page, 1) == 0;
}
/**
@@ -161,7 +301,7 @@ static inline void page_pool_put_page(struct page_pool *pool,
* allow registering MEM_TYPE_PAGE_POOL, but shield linker.
*/
#ifdef CONFIG_PAGE_POOL
- if (!page_pool_is_last_frag(pool, page))
+ if (!page_pool_is_last_frag(page))
return;
page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct);
@@ -201,6 +341,20 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
(sizeof(dma_addr_t) > sizeof(unsigned long))
/**
+ * page_pool_free_va() - free a va into the page_pool
+ * @pool: pool from which va was allocated
+ * @va: va to be freed
+ * @allow_direct: freed by the consumer, allow lockless caching
+ *
+ * Free a va allocated from page_pool_alloc_va().
+ */
+static inline void page_pool_free_va(struct page_pool *pool, void *va,
+ bool allow_direct)
+{
+ page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct);
+}
+
+/**
* page_pool_get_dma_addr() - Retrieve the stored DMA address.
* @page: page allocated from a page pool
*
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 887e7946a597..6fc5134095ed 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -17,10 +17,8 @@
* Please note DMA-sync-for-CPU is still
* device driver responsibility
*/
-#define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */
#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\
- PP_FLAG_DMA_SYNC_DEV |\
- PP_FLAG_PAGE_FRAG)
+ PP_FLAG_DMA_SYNC_DEV)
/*
* Fast allocation side cache array/stack
@@ -45,7 +43,7 @@ struct pp_alloc_cache {
/**
* struct page_pool_params - page pool parameters
- * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG
+ * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV
* @order: 2^order pages on allocation
* @pool_size: size of the ptr_ring
* @nid: NUMA node id to allocate from pages from
diff --git a/include/net/sock.h b/include/net/sock.h
index 01f0005cb7d8..242590308d64 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -336,7 +336,7 @@ struct sk_filter;
* @sk_cgrp_data: cgroup data for this cgroup
* @sk_memcg: this socket's memory cgroup association
* @sk_write_pending: a write to stream socket waits to start
- * @sk_wait_pending: number of threads blocked on this socket
+ * @sk_disconnects: number of disconnect operations performed on this sock
* @sk_state_change: callback to indicate change in the state of the sock
* @sk_data_ready: callback to indicate there is data to be processed
* @sk_write_space: callback to indicate there is bf sending space available
@@ -429,7 +429,7 @@ struct sock {
unsigned int sk_napi_id;
#endif
int sk_rcvbuf;
- int sk_wait_pending;
+ int sk_disconnects;
struct sk_filter __rcu *sk_filter;
union {
@@ -1189,8 +1189,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
}
#define sk_wait_event(__sk, __timeo, __condition, __wait) \
- ({ int __rc; \
- __sk->sk_wait_pending++; \
+ ({ int __rc, __dis = __sk->sk_disconnects; \
release_sock(__sk); \
__rc = __condition; \
if (!__rc) { \
@@ -1200,8 +1199,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
} \
sched_annotate_sleep(); \
lock_sock(__sk); \
- __sk->sk_wait_pending--; \
- __rc = __condition; \
+ __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \
__rc; \
})
diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index b24ea2d9400b..8a6dbfb23336 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -22,6 +22,7 @@ struct tcf_ct_params {
struct nf_nat_range2 range;
bool ipv4_range;
+ bool put_labels;
u16 ct_action;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 32146088a095..993b7fcd4e46 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -143,6 +143,9 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCP_RTO_MAX ((unsigned)(120*HZ))
#define TCP_RTO_MIN ((unsigned)(HZ/5))
#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
+
+#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */
+
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now
* used as a fallback RTO for the
@@ -163,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define MAX_TCP_KEEPCNT 127
#define MAX_TCP_SYNCNT 127
-#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
+/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
+ * to avoid overflows. This assumes a clock smaller than 1 Mhz.
+ * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz.
+ */
+#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)
+
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
* after this time. It should be equal
* (or greater than) TCP_TIMEWAIT_LEN
@@ -795,22 +803,31 @@ static inline u64 tcp_clock_us(void)
return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}
-/* This should only be used in contexts where tp->tcp_mstamp is up to date */
-static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
+static inline u64 tcp_clock_ms(void)
+{
+ return div_u64(tcp_clock_ns(), NSEC_PER_MSEC);
+}
+
+/* TCP Timestamp included in TS option (RFC 1323) can either use ms
+ * or usec resolution. Each socket carries a flag to select one or other
+ * resolution, as the route attribute could change anytime.
+ * Each flow must stick to initial resolution.
+ */
+static inline u32 tcp_clock_ts(bool usec_ts)
{
- return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+ return usec_ts ? tcp_clock_us() : tcp_clock_ms();
}
-/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
-static inline u32 tcp_ns_to_ts(u64 ns)
+static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
{
- return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
+ return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
}
-/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
-static inline u32 tcp_time_stamp_raw(void)
+static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
{
- return tcp_ns_to_ts(tcp_clock_ns());
+ if (tp->tcp_usec_ts)
+ return tp->tcp_mstamp;
+ return tcp_time_stamp_ms(tp);
}
void tcp_mstamp_refresh(struct tcp_sock *tp);
@@ -820,17 +837,30 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
return max_t(s64, t1 - t0, 0);
}
-static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
-{
- return tcp_ns_to_ts(skb->skb_mstamp_ns);
-}
-
/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}
+/* Provide skb TSval in usec or ms unit */
+static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
+{
+ if (usec_ts)
+ return tcp_skb_timestamp_us(skb);
+
+ return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC);
+}
+
+static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
+{
+ return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
+}
+
+static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
+{
+ return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
+}
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
@@ -1459,13 +1489,15 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
}
+/* Assume a conservative default of 1200 bytes of payload per 4K page.
+ * This may be adjusted later in tcp_measure_rcv_mss().
+ */
+#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
+ SKB_TRUESIZE(4096))
+
static inline void tcp_scaling_ratio_init(struct sock *sk)
{
- /* Assume a conservative default of 1200 bytes of payload per 4K page.
- * This may be adjusted later in tcp_measure_rcv_mss().
- */
- tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
- SKB_TRUESIZE(4096);
+ tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
}
/* Note: caller must be prepared to deal with negative returns */
@@ -1596,7 +1628,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
return true;
if (unlikely(!time_before32(ktime_get_seconds(),
- rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
+ rx_opt->ts_recent_stamp + TCP_PAWS_WRAP)))
return true;
/*
* Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,
diff --git a/include/net/tcx.h b/include/net/tcx.h
index 264f147953ba..04be9377785d 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -38,16 +38,11 @@ static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
return container_of(bundle, struct tcx_entry, bundle);
}
-static inline struct tcx_link *tcx_link(struct bpf_link *link)
+static inline struct tcx_link *tcx_link(const struct bpf_link *link)
{
return container_of(link, struct tcx_link, link);
}
-static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link)
-{
- return tcx_link((struct bpf_link *)link);
-}
-
void tcx_inc(void);
void tcx_dec(void);
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 4d0578fab01a..d716214fe03d 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -169,6 +169,14 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
const struct ip_tunnel_key *key,
__be16 sport, __be16 dport, u8 tos,
struct dst_cache *dst_cache);
+struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net,
+ struct socket *sock, int oif,
+ struct in6_addr *saddr,
+ const struct ip_tunnel_key *key,
+ __be16 sport, __be16 dport, u8 dsfield,
+ struct dst_cache *dst_cache);
struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
__be16 flags, __be64 tunnel_id,
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 69b472604b86..f83128007fb0 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -63,6 +63,13 @@ struct xdp_sock {
struct xsk_queue *tx ____cacheline_aligned_in_smp;
struct list_head tx_list;
+ /* record the number of tx descriptors sent by this xsk and
+ * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs
+ * to be given to other xsks for sending tx descriptors, thereby
+ * preventing other XSKs from being starved.
+ */
+ u32 tx_budget_spent;
+
/* Protects generic receive. */
spinlock_t rx_lock;
@@ -109,4 +116,13 @@ static inline void __xsk_map_flush(void)
#endif /* CONFIG_XDP_SOCKETS */
+#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
+bool xsk_map_check_flush(void);
+#else
+static inline bool xsk_map_check_flush(void)
+{
+ return false;
+}
+#endif
+
#endif /* _LINUX_XDP_SOCK_H */