summaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/datagram.c14
-rw-r--r--net/core/dev.c33
-rw-r--r--net/core/filter.c39
-rw-r--r--net/core/flow_offload.c6
-rw-r--r--net/core/net-sysfs.c1
-rw-r--r--net/core/secure_seq.c4
-rw-r--r--net/core/skbuff.c37
-rw-r--r--net/core/skmsg.c5
-rw-r--r--net/core/sock_reuseport.c4
9 files changed, 110 insertions, 33 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 50f4faeea76c..ecbc0f471089 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -613,10 +613,16 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
- struct iov_iter *from, size_t length)
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length)
{
- int frag = skb_shinfo(skb)->nr_frags;
+ int frag;
+
+ if (msg && msg->msg_ubuf && msg->sg_from_iter)
+ return msg->sg_from_iter(sk, skb, from, length);
+
+ frag = skb_shinfo(skb)->nr_frags;
while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
@@ -702,7 +708,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
diff --git a/net/core/dev.c b/net/core/dev.c
index 08ce317fcec8..30a1603a7225 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -397,16 +397,18 @@ static void list_netdevice(struct net_device *dev)
/* Device list removal
* caller must respect a RCU grace period before freeing/reusing dev
*/
-static void unlist_netdevice(struct net_device *dev)
+static void unlist_netdevice(struct net_device *dev, bool lock)
{
ASSERT_RTNL();
/* Unlink dev from the device chain */
- write_lock(&dev_base_lock);
+ if (lock)
+ write_lock(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
- write_unlock(&dev_base_lock);
+ if (lock)
+ write_unlock(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
@@ -4861,7 +4863,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
}
/* When doing generic XDP we have to bypass the qdisc layer and the
- * network taps in order to match in-driver-XDP behavior.
+ * network taps in order to match in-driver-XDP behavior. This also means
+ * that XDP packets are able to starve other packets going through a qdisc,
+ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
+ * queues, so they do not have this starvation issue.
*/
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
@@ -4873,7 +4878,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
txq = netdev_core_pick_tx(dev, skb, NULL);
cpu = smp_processor_id();
HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_xmit_stopped(txq)) {
+ if (!netif_xmit_frozen_or_drv_stopped(txq)) {
rc = netdev_start_xmit(skb, dev, txq, 0);
if (dev_xmit_complete(rc))
free_skb = false;
@@ -4881,6 +4886,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
HARD_TX_UNLOCK(dev, txq);
if (free_skb) {
trace_xdp_exception(dev, xdp_prog, XDP_TX);
+ dev_core_stats_tx_dropped_inc(dev);
kfree_skb(skb);
}
}
@@ -10043,11 +10049,11 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
ret = netdev_register_kobject(dev);
- if (ret) {
- dev->reg_state = NETREG_UNREGISTERED;
+ write_lock(&dev_base_lock);
+ dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
+ write_unlock(&dev_base_lock);
+ if (ret)
goto err_uninit;
- }
- dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
@@ -10329,7 +10335,9 @@ void netdev_run_todo(void)
continue;
}
+ write_lock(&dev_base_lock);
dev->reg_state = NETREG_UNREGISTERED;
+ write_unlock(&dev_base_lock);
linkwatch_forget_dev(dev);
}
@@ -10810,9 +10818,10 @@ void unregister_netdevice_many(struct list_head *head)
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
- unlist_netdevice(dev);
-
+ write_lock(&dev_base_lock);
+ unlist_netdevice(dev, false);
dev->reg_state = NETREG_UNREGISTERING;
+ write_unlock(&dev_base_lock);
}
flush_all_backlogs();
@@ -10959,7 +10968,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
dev_close(dev);
/* And unlink it from device chain */
- unlist_netdevice(dev);
+ unlist_netdevice(dev, true);
synchronize_net();
diff --git a/net/core/filter.c b/net/core/filter.c
index 5af58eb48587..7950f7520765 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6158,7 +6158,6 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
if (err)
return err;
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
return seg6_lookup_nexthop(skb, NULL, 0);
@@ -6516,10 +6515,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
ifindex, proto, netns_id, flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
@@ -6553,10 +6563,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
@@ -7020,7 +7041,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
return -EINVAL;
- if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
return -EINVAL;
if (!th->ack || th->rst || th->syn)
@@ -7095,7 +7116,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
return -EINVAL;
- if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
return -ENOENT;
if (!th->syn || th->ack || th->fin || th->rst)
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 73f68d4625f3..929f6379a279 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -595,3 +595,9 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count;
}
EXPORT_SYMBOL(flow_indr_dev_setup_offload);
+
+bool flow_indr_dev_exists(void)
+{
+ return !list_empty(&flow_block_indr_dev_list);
+}
+EXPORT_SYMBOL(flow_indr_dev_exists);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e319e242dddf..a3642569fe53 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -33,6 +33,7 @@ static const char fmt_dec[] = "%d\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
+/* Caller holds RTNL or dev_base_lock */
static inline int dev_isalive(const struct net_device *dev)
{
return dev->reg_state <= NETREG_REGISTERED;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 5f85e01d4093..b0ff6153be62 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -64,7 +64,7 @@ u32 secure_tcpv6_ts_off(const struct net *net,
.daddr = *(struct in6_addr *)daddr,
};
- if (net->ipv4.sysctl_tcp_timestamps != 1)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
return 0;
ts_secret_init();
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#ifdef CONFIG_INET
u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
{
- if (net->ipv4.sysctl_tcp_timestamps != 1)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
return 0;
ts_secret_init();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5b3559cb1d82..cf4107d80bc4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -666,11 +666,18 @@ static void skb_release_data(struct sk_buff *skb)
&shinfo->dataref))
goto exit;
- skb_zcopy_clear(skb, true);
+ if (skb_zcopy(skb)) {
+ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
+
+ skb_zcopy_clear(skb, true);
+ if (skip_unref)
+ goto free_head;
+ }
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+free_head:
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
@@ -895,7 +902,10 @@ EXPORT_SYMBOL(skb_dump);
*/
void skb_tx_error(struct sk_buff *skb)
{
- skb_zcopy_clear(skb, true);
+ if (skb) {
+ skb_zcopy_downgrade_managed(skb);
+ skb_zcopy_clear(skb, true);
+ }
}
EXPORT_SYMBOL(skb_tx_error);
@@ -1193,7 +1203,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- uarg->flags = SKBFL_ZEROCOPY_FRAG;
+ uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
refcount_set(&uarg->refcnt, 1);
sock_hold(sk);
@@ -1212,6 +1222,10 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
+ /* there might be non MSG_ZEROCOPY users */
+ if (uarg->callback != msg_zerocopy_callback)
+ return NULL;
+
/* realloc only when socket is locked (TCP, UDP cork),
* so uarg->len and sk_zckey access is serialized
*/
@@ -1354,7 +1368,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
if (orig_uarg && uarg != orig_uarg)
return -EEXIST;
- err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
@@ -1371,6 +1385,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+ int i;
+
+ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+}
+EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
+
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
gfp_t gfp_mask)
{
@@ -1688,6 +1712,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
BUG_ON(skb_shared(skb));
+ skb_zcopy_downgrade_managed(skb);
+
size = SKB_DATA_ALIGN(size);
if (skb_pfmemalloc(skb))
@@ -3484,6 +3510,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
int pos = skb_headlen(skb);
const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
+ skb_zcopy_downgrade_managed(skb);
+
skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
@@ -3837,6 +3865,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
} else if (i < MAX_SKB_FRAGS) {
+ skb_zcopy_downgrade_managed(skb);
get_page(page);
skb_fill_page_desc(skb, i, page, offset, size);
} else {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 22b983ade0e7..b0fcd0200e84 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -699,6 +699,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
write_lock_bh(&sk->sk_callback_lock);
+ if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) {
+ psock = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
if (sk->sk_user_data) {
psock = ERR_PTR(-EBUSY);
goto out;
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 3f00a28fe762..5daa1fa54249 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -387,7 +387,7 @@ void reuseport_stop_listen_sock(struct sock *sk)
prog = rcu_dereference_protected(reuse->prog,
lockdep_is_held(&reuseport_lock));
- if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
(prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
/* Migration capable, move sk from the listening section
* to the closed section.
@@ -545,7 +545,7 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
hash = migrating_sk->sk_hash;
prog = rcu_dereference(reuse->prog);
if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
- if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
goto select_by_hash;
goto failure;
}