summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig3
-rw-r--r--net/ipv4/af_inet.c63
-rw-r--r--net/ipv4/arp.c204
-rw-r--r--net/ipv4/bpf_tcp_ca.c12
-rw-r--r--net/ipv4/cipso_ipv4.c84
-rw-r--r--net/ipv4/devinet.c95
-rw-r--r--net/ipv4/esp4.c27
-rw-r--r--net/ipv4/esp4_offload.c24
-rw-r--r--net/ipv4/fib_frontend.c6
-rw-r--r--net/ipv4/fib_rules.c54
-rw-r--r--net/ipv4/fib_semantics.c26
-rw-r--r--net/ipv4/fib_trie.c4
-rw-r--r--net/ipv4/fou_bpf.c2
-rw-r--r--net/ipv4/fou_core.c31
-rw-r--r--net/ipv4/gre_demux.c2
-rw-r--r--net/ipv4/icmp.c161
-rw-r--r--net/ipv4/igmp.c3
-rw-r--r--net/ipv4/inet_connection_sock.c89
-rw-r--r--net/ipv4/inet_diag.c6
-rw-r--r--net/ipv4/inet_fragment.c6
-rw-r--r--net/ipv4/inet_hashtables.c15
-rw-r--r--net/ipv4/inet_timewait_sock.c79
-rw-r--r--net/ipv4/ip_fragment.c6
-rw-r--r--net/ipv4/ip_gre.c153
-rw-r--r--net/ipv4/ip_input.c8
-rw-r--r--net/ipv4/ip_output.c29
-rw-r--r--net/ipv4/ip_tunnel.c144
-rw-r--r--net/ipv4/ip_tunnel_core.c82
-rw-r--r--net/ipv4/ip_vti.c43
-rw-r--r--net/ipv4/ipip.c35
-rw-r--r--net/ipv4/ipmr.c14
-rw-r--r--net/ipv4/metrics.c8
-rw-r--r--net/ipv4/netfilter.c3
-rw-r--r--net/ipv4/netfilter/arp_tables.c4
-rw-r--r--net/ipv4/netfilter/ip_tables.c4
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c3
-rw-r--r--net/ipv4/netfilter/iptable_filter.c2
-rw-r--r--net/ipv4/netfilter/iptable_nat.c18
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c3
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c4
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c5
-rw-r--r--net/ipv4/nexthop.c60
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/proc.c3
-rw-r--r--net/ipv4/raw.c9
-rw-r--r--net/ipv4/route.c121
-rw-r--r--net/ipv4/syncookies.c5
-rw-r--r--net/ipv4/sysctl_net_ipv4.c149
-rw-r--r--net/ipv4/tcp.c489
-rw-r--r--net/ipv4/tcp_ao.c89
-rw-r--r--net/ipv4/tcp_bbr.c6
-rw-r--r--net/ipv4/tcp_bpf.c6
-rw-r--r--net/ipv4/tcp_cong.c20
-rw-r--r--net/ipv4/tcp_cubic.c4
-rw-r--r--net/ipv4/tcp_dctcp.c17
-rw-r--r--net/ipv4/tcp_fastopen.c7
-rw-r--r--net/ipv4/tcp_htcp.c2
-rw-r--r--net/ipv4/tcp_input.c240
-rw-r--r--net/ipv4/tcp_ipv4.c160
-rw-r--r--net/ipv4/tcp_metrics.c18
-rw-r--r--net/ipv4/tcp_minisocks.c100
-rw-r--r--net/ipv4/tcp_offload.c241
-rw-r--r--net/ipv4/tcp_output.c177
-rw-r--r--net/ipv4/tcp_sigpool.c17
-rw-r--r--net/ipv4/tcp_timer.c43
-rw-r--r--net/ipv4/udp.c97
-rw-r--r--net/ipv4/udp_offload.c54
-rw-r--r--net/ipv4/udp_tunnel_core.c8
-rw-r--r--net/ipv4/xfrm4_input.c19
-rw-r--r--net/ipv4/xfrm4_policy.c5
71 files changed, 2455 insertions, 1279 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8e94ed7c56a0..6d2c97f8e9ef 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -661,7 +661,8 @@ config TCP_CONG_CDG
For further details see:
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
- delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+ delay gradients." In Networking 2011. Preprint:
+ http://caia.swin.edu.au/cv/dahayes/content/networking2011-cdg-preprint.pdf
config TCP_CONG_BBR
tristate "BBR TCP"
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 55bd72997b31..b24d74616637 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -758,7 +758,9 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
sock_rps_record_flow(newsk);
WARN_ON(!((1 << newsk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
- TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+ TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
+ TCPF_CLOSING | TCPF_CLOSE_WAIT |
+ TCPF_CLOSE)));
if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
set_bit(SOCK_SUPPORT_ZC, &newsock->flags);
@@ -771,16 +773,16 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
* Accept a pending connection. The TCP layer now gives BSD semantics.
*/
-int inet_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+int inet_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
struct sock *sk1 = sock->sk, *sk2;
- int err = -EINVAL;
/* IPV6_ADDRFORM can change sk->sk_prot under us. */
- sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern);
+ arg->err = -EINVAL;
+ sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg);
if (!sk2)
- return err;
+ return arg->err;
lock_sock(sk2);
__inet_accept(sock, newsock, sk2);
@@ -1072,6 +1074,7 @@ const struct proto_ops inet_stream_ops = {
#endif
.splice_eof = inet_splice_eof,
.splice_read = tcp_splice_read,
+ .set_peek_off = sk_set_peek_off,
.read_sock = tcp_read_sock,
.read_skb = tcp_read_skb,
.sendmsg_locked = tcp_sendmsg_locked,
@@ -1306,8 +1309,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
int inet_sk_rebuild_header(struct sock *sk)
{
+ struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
__be32 daddr;
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
@@ -1481,7 +1484,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
struct sk_buff *p;
unsigned int hlen;
unsigned int off;
- unsigned int id;
int flush = 1;
int proto;
@@ -1507,13 +1509,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
goto out;
NAPI_GRO_CB(skb)->proto = proto;
- id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
- id >>= 16;
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
list_for_each_entry(p, head, list) {
struct iphdr *iph2;
- u16 flush_id;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1530,48 +1529,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
-
- /* All fields must match except length and checksum. */
- NAPI_GRO_CB(p)->flush |=
- (iph->ttl ^ iph2->ttl) |
- (iph->tos ^ iph2->tos) |
- ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
-
- NAPI_GRO_CB(p)->flush |= flush;
-
- /* We need to store of the IP ID check to be included later
- * when we can verify that this packet does in fact belong
- * to a given flow.
- */
- flush_id = (u16)(id - ntohs(iph2->id));
-
- /* This bit of code makes it much easier for us to identify
- * the cases where we are doing atomic vs non-atomic IP ID
- * checks. Specifically an atomic check can return IP ID
- * values 0 - 0xFFFF, while a non-atomic check can only
- * return 0 or 0xFFFF.
- */
- if (!NAPI_GRO_CB(p)->is_atomic ||
- !(iph->frag_off & htons(IP_DF))) {
- flush_id ^= NAPI_GRO_CB(p)->count;
- flush_id = flush_id ? 0xFFFF : 0;
- }
-
- /* If the previous IP ID value was based on an atomic
- * datagram we can overwrite the value and ignore it.
- */
- if (NAPI_GRO_CB(skb)->is_atomic)
- NAPI_GRO_CB(p)->flush_id = flush_id;
- else
- NAPI_GRO_CB(p)->flush_id |= flush_id;
}
- NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
- skb_set_network_header(skb, off);
- /* The above will be needed by the transport layer if there is one
- * immediately following this IP hdr.
- */
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;
/* Note : No need to call skb_gro_postpull_rcsum() here,
* as we already checked checksum over ipv4 header was 0
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 0d0d725b46ad..11c1519b3699 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -456,7 +456,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
/*unsigned long now; */
struct net *net = dev_net(dev);
- rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev));
+ rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev),
+ RT_SCOPE_UNIVERSE);
if (IS_ERR(rt))
return 1;
if (rt->dst.dev != dev) {
@@ -1002,6 +1003,55 @@ out_of_mem:
* User level interface (ioctl)
*/
+static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r,
+ bool getarp)
+{
+ struct net_device *dev;
+
+ if (getarp)
+ dev = dev_get_by_name_rcu(net, r->arp_dev);
+ else
+ dev = __dev_get_by_name(net, r->arp_dev);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */
+ if (!r->arp_ha.sa_family)
+ r->arp_ha.sa_family = dev->type;
+
+ if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type)
+ return ERR_PTR(-EINVAL);
+
+ return dev;
+}
+
+static struct net_device *arp_req_dev(struct net *net, struct arpreq *r)
+{
+ struct net_device *dev;
+ struct rtable *rt;
+ __be32 ip;
+
+ if (r->arp_dev[0])
+ return arp_req_dev_by_name(net, r, false);
+
+ if (r->arp_flags & ATF_PUBL)
+ return NULL;
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+ rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK);
+ if (IS_ERR(rt))
+ return ERR_CAST(rt);
+
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+
+ if (!dev)
+ return ERR_PTR(-EINVAL);
+
+ return dev;
+}
+
/*
* Set (create) an ARP cache entry.
*/
@@ -1022,11 +1072,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
static int arp_req_set_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
- __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (mask && mask != htonl(0xFFFFFFFF))
- return -EINVAL;
if (!dev && (r->arp_flags & ATF_COM)) {
dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
r->arp_ha.sa_data);
@@ -1034,6 +1081,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
return -ENODEV;
}
if (mask) {
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
return -ENOBUFS;
return 0;
@@ -1042,29 +1091,20 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
return arp_req_set_proxy(net, dev, 1);
}
-static int arp_req_set(struct net *net, struct arpreq *r,
- struct net_device *dev)
+static int arp_req_set(struct net *net, struct arpreq *r)
{
- __be32 ip;
struct neighbour *neigh;
+ struct net_device *dev;
+ __be32 ip;
int err;
+ dev = arp_req_dev(net, r);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
if (r->arp_flags & ATF_PUBL)
return arp_req_set_public(net, r, dev);
- ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (r->arp_flags & ATF_PERM)
- r->arp_flags |= ATF_COM;
- if (!dev) {
- struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
-
- if (IS_ERR(rt))
- return PTR_ERR(rt);
- dev = rt->dst.dev;
- ip_rt_put(rt);
- if (!dev)
- return -EINVAL;
- }
switch (dev->type) {
#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
@@ -1086,12 +1126,18 @@ static int arp_req_set(struct net *net, struct arpreq *r,
break;
}
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
err = PTR_ERR(neigh);
if (!IS_ERR(neigh)) {
unsigned int state = NUD_STALE;
- if (r->arp_flags & ATF_PERM)
+
+ if (r->arp_flags & ATF_PERM) {
+ r->arp_flags |= ATF_COM;
state = NUD_PERMANENT;
+ }
+
err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
r->arp_ha.sa_data : NULL, state,
NEIGH_UPDATE_F_OVERRIDE |
@@ -1115,27 +1161,40 @@ static unsigned int arp_state_to_flags(struct neighbour *neigh)
* Get an ARP cache entry.
*/
-static int arp_req_get(struct arpreq *r, struct net_device *dev)
+static int arp_req_get(struct net *net, struct arpreq *r)
{
__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
struct neighbour *neigh;
- int err = -ENXIO;
+ struct net_device *dev;
+
+ if (!r->arp_dev[0])
+ return -ENODEV;
+
+ dev = arp_req_dev_by_name(net, r, true);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
neigh = neigh_lookup(&arp_tbl, &ip, dev);
- if (neigh) {
- if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) {
- read_lock_bh(&neigh->lock);
- memcpy(r->arp_ha.sa_data, neigh->ha,
- min(dev->addr_len, sizeof(r->arp_ha.sa_data_min)));
- r->arp_flags = arp_state_to_flags(neigh);
- read_unlock_bh(&neigh->lock);
- r->arp_ha.sa_family = dev->type;
- strscpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
- err = 0;
- }
+ if (!neigh)
+ return -ENXIO;
+
+ if (READ_ONCE(neigh->nud_state) & NUD_NOARP) {
neigh_release(neigh);
+ return -ENXIO;
}
- return err;
+
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha,
+ min(dev->addr_len, sizeof(r->arp_ha.sa_data_min)));
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+
+ neigh_release(neigh);
+
+ r->arp_ha.sa_family = dev->type;
+ netdev_copy_name(dev, r->arp_dev);
+
+ return 0;
}
int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
@@ -1166,36 +1225,31 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
static int arp_req_delete_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
- __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (mask == htonl(0xFFFFFFFF))
- return pneigh_delete(&arp_tbl, net, &ip, dev);
+ if (mask) {
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (mask)
- return -EINVAL;
+ return pneigh_delete(&arp_tbl, net, &ip, dev);
+ }
return arp_req_set_proxy(net, dev, 0);
}
-static int arp_req_delete(struct net *net, struct arpreq *r,
- struct net_device *dev)
+static int arp_req_delete(struct net *net, struct arpreq *r)
{
+ struct net_device *dev;
__be32 ip;
+ dev = arp_req_dev(net, r);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
if (r->arp_flags & ATF_PUBL)
return arp_req_delete_public(net, r, dev);
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (!dev) {
- struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
- dev = rt->dst.dev;
- ip_rt_put(rt);
- if (!dev)
- return -EINVAL;
- }
+
return arp_invalidate(dev, ip, true);
}
@@ -1205,9 +1259,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
- int err;
struct arpreq r;
- struct net_device *dev = NULL;
+ __be32 *netmask;
+ int err;
switch (cmd) {
case SIOCDARP:
@@ -1230,42 +1284,34 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!(r.arp_flags & ATF_PUBL) &&
(r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
return -EINVAL;
+
+ netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr;
if (!(r.arp_flags & ATF_NETMASK))
- ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
- htonl(0xFFFFFFFFUL);
- rtnl_lock();
- if (r.arp_dev[0]) {
- err = -ENODEV;
- dev = __dev_get_by_name(net, r.arp_dev);
- if (!dev)
- goto out;
-
- /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
- if (!r.arp_ha.sa_family)
- r.arp_ha.sa_family = dev->type;
- err = -EINVAL;
- if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
- goto out;
- } else if (cmd == SIOCGARP) {
- err = -ENODEV;
- goto out;
- }
+ *netmask = htonl(0xFFFFFFFFUL);
+ else if (*netmask && *netmask != htonl(0xFFFFFFFFUL))
+ return -EINVAL;
switch (cmd) {
case SIOCDARP:
- err = arp_req_delete(net, &r, dev);
+ rtnl_lock();
+ err = arp_req_delete(net, &r);
+ rtnl_unlock();
break;
case SIOCSARP:
- err = arp_req_set(net, &r, dev);
+ rtnl_lock();
+ err = arp_req_set(net, &r);
+ rtnl_unlock();
break;
case SIOCGARP:
- err = arp_req_get(&r, dev);
+ rcu_read_lock();
+ err = arp_req_get(net, &r);
+ rcu_read_unlock();
+
+ if (!err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
break;
}
-out:
- rtnl_unlock();
- if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
- err = -EFAULT;
+
return err;
}
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 7f518ea5f4ac..3f88d0961e5b 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -107,6 +107,9 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
case offsetof(struct tcp_sock, snd_cwnd_cnt):
end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
break;
+ case offsetof(struct tcp_sock, snd_cwnd_stamp):
+ end = offsetofend(struct tcp_sock, snd_cwnd_stamp);
+ break;
case offsetof(struct tcp_sock, snd_ssthresh):
end = offsetofend(struct tcp_sock, snd_ssthresh);
break;
@@ -257,17 +260,17 @@ static int bpf_tcp_ca_check_member(const struct btf_type *t,
return 0;
}
-static int bpf_tcp_ca_reg(void *kdata)
+static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link)
{
return tcp_register_congestion_control(kdata);
}
-static void bpf_tcp_ca_unreg(void *kdata)
+static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link)
{
tcp_unregister_congestion_control(kdata);
}
-static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link)
{
return tcp_update_congestion_control(kdata, old_kdata);
}
@@ -307,7 +310,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
return 0;
}
-static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs)
+static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
+ const struct rate_sample *rs)
{
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 8b17d83e5fde..8cc0e2f4159d 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1810,11 +1810,35 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
return CIPSO_V4_HDR_LEN + ret_val;
}
+static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len)
+{
+ int iter = 0, optlen = 0;
+
+ /* determining the new total option length is tricky because of
+ * the padding necessary, the only thing i can think to do at
+ * this point is walk the options one-by-one, skipping the
+ * padding at the end to determine the actual option size and
+ * from there we can determine the new total option length
+ */
+ while (iter < len) {
+ if (data[iter] == IPOPT_END) {
+ break;
+ } else if (data[iter] == IPOPT_NOP) {
+ iter++;
+ } else {
+ iter += data[iter + 1];
+ optlen = iter;
+ }
+ }
+ return optlen;
+}
+
/**
* cipso_v4_sock_setattr - Add a CIPSO option to a socket
* @sk: the socket
* @doi_def: the CIPSO DOI to use
* @secattr: the specific security attributes of the socket
+ * @sk_locked: true if caller holds the socket lock
*
* Description:
* Set the CIPSO option on the given socket using the DOI definition and
@@ -1826,7 +1850,8 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
*/
int cipso_v4_sock_setattr(struct sock *sk,
const struct cipso_v4_doi *doi_def,
- const struct netlbl_lsm_secattr *secattr)
+ const struct netlbl_lsm_secattr *secattr,
+ bool sk_locked)
{
int ret_val = -EPERM;
unsigned char *buf = NULL;
@@ -1876,8 +1901,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
sk_inet = inet_sk(sk);
- old = rcu_dereference_protected(sk_inet->inet_opt,
- lockdep_sock_is_held(sk));
+ old = rcu_dereference_protected(sk_inet->inet_opt, sk_locked);
if (inet_test_bit(IS_ICSK, sk)) {
sk_conn = inet_csk(sk);
if (old)
@@ -1952,7 +1976,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
buf = NULL;
req_inet = inet_rsk(req);
- opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt);
+ opt = unrcu_pointer(xchg(&req_inet->ireq_opt, RCU_INITIALIZER(opt)));
if (opt)
kfree_rcu(opt, rcu);
@@ -1985,7 +2009,6 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
u8 cipso_len;
u8 cipso_off;
unsigned char *cipso_ptr;
- int iter;
int optlen_new;
cipso_off = opt->opt.cipso - sizeof(struct iphdr);
@@ -2005,19 +2028,8 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
memmove(cipso_ptr, cipso_ptr + cipso_len,
opt->opt.optlen - cipso_off - cipso_len);
- /* determining the new total option length is tricky because of
- * the padding necessary, the only thing i can think to do at
- * this point is walk the options one-by-one, skipping the
- * padding at the end to determine the actual option size and
- * from there we can determine the new total option length */
- iter = 0;
- optlen_new = 0;
- while (iter < opt->opt.optlen)
- if (opt->opt.__data[iter] != IPOPT_NOP) {
- iter += opt->opt.__data[iter + 1];
- optlen_new = iter;
- } else
- iter++;
+ optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data,
+ opt->opt.optlen);
hdr_delta = opt->opt.optlen;
opt->opt.optlen = (optlen_new + 3) & ~3;
hdr_delta -= opt->opt.optlen;
@@ -2237,7 +2249,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb,
*/
int cipso_v4_skbuff_delattr(struct sk_buff *skb)
{
- int ret_val;
+ int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len,
+ hdr_len_delta;
struct iphdr *iph;
struct ip_options *opt = &IPCB(skb)->opt;
unsigned char *cipso_ptr;
@@ -2250,16 +2263,37 @@ int cipso_v4_skbuff_delattr(struct sk_buff *skb)
if (ret_val < 0)
return ret_val;
- /* the easiest thing to do is just replace the cipso option with noop
- * options since we don't change the size of the packet, although we
- * still need to recalculate the checksum */
-
iph = ip_hdr(skb);
cipso_ptr = (unsigned char *)iph + opt->cipso;
- memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+ cipso_len = cipso_ptr[1];
+
+ hdr_len_actual = sizeof(struct iphdr) +
+ cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1),
+ opt->optlen);
+ new_hdr_len_actual = hdr_len_actual - cipso_len;
+ new_hdr_len = (new_hdr_len_actual + 3) & ~3;
+ hdr_len_delta = (iph->ihl << 2) - new_hdr_len;
+
+ /* 1. shift any options after CIPSO to the left */
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ new_hdr_len_actual - opt->cipso);
+ /* 2. move the whole IP header to its new place */
+ memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual);
+ /* 3. adjust the skb layout */
+ skb_pull(skb, hdr_len_delta);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ /* 4. re-fill new padding with IPOPT_END (may now be longer) */
+ memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END,
+ new_hdr_len - new_hdr_len_actual);
+
+ opt->optlen -= hdr_len_delta;
opt->cipso = 0;
opt->is_changed = 1;
-
+ if (hdr_len_delta != 0) {
+ iph->ihl = new_hdr_len >> 2;
+ iph_set_totlen(iph, skb->len);
+ }
ip_send_check(iph);
return 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7a437f0d4190..ab76744383cf 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -216,22 +216,37 @@ static void devinet_sysctl_unregister(struct in_device *idev)
/* Locks all the inet devices. */
-static struct in_ifaddr *inet_alloc_ifa(void)
+static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev)
{
- return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT);
+ struct in_ifaddr *ifa;
+
+ ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT);
+ if (!ifa)
+ return NULL;
+
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+
+ INIT_HLIST_NODE(&ifa->hash);
+
+ return ifa;
}
static void inet_rcu_free_ifa(struct rcu_head *head)
{
struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
- if (ifa->ifa_dev)
- in_dev_put(ifa->ifa_dev);
+
+ in_dev_put(ifa->ifa_dev);
kfree(ifa);
}
static void inet_free_ifa(struct in_ifaddr *ifa)
{
- call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+ /* Our reference to ifa->ifa_dev must be freed ASAP
+ * to release the reference to the netdev the same way.
+ * in_dev_put() -> in_dev_finish_destroy() -> netdev_put()
+ */
+ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa);
}
static void in_dev_free_rcu(struct rcu_head *head)
@@ -569,17 +584,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
ASSERT_RTNL();
- if (!in_dev) {
- inet_free_ifa(ifa);
- return -ENOBUFS;
- }
ipv4_devconf_setall(in_dev);
neigh_parms_data_state_setall(in_dev->arp_parms);
- if (ifa->ifa_dev != in_dev) {
- WARN_ON(ifa->ifa_dev);
- in_dev_hold(in_dev);
- ifa->ifa_dev = in_dev;
- }
+
if (ipv4_is_loopback(ifa->ifa_local))
ifa->ifa_scope = RT_SCOPE_HOST;
return inet_insert_ifa(ifa);
@@ -696,8 +703,6 @@ errout:
return err;
}
-#define INFINITY_LIFE_TIME 0xFFFFFFFF
-
static void check_lifetime(struct work_struct *work)
{
unsigned long now, next, next_sec, next_sched;
@@ -870,7 +875,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
if (!in_dev)
goto errout;
- ifa = inet_alloc_ifa();
+ ifa = inet_alloc_ifa(in_dev);
if (!ifa)
/*
* A potential indev allocation can be left alive, it stays
@@ -880,19 +885,15 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ipv4_devconf_setall(in_dev);
neigh_parms_data_state_setall(in_dev->arp_parms);
- in_dev_hold(in_dev);
if (!tb[IFA_ADDRESS])
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
- INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
ifm->ifa_flags;
ifa->ifa_scope = ifm->ifa_scope;
- ifa->ifa_dev = in_dev;
-
ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
@@ -1179,10 +1180,12 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
if (!ifa) {
ret = -ENOBUFS;
- ifa = inet_alloc_ifa();
+ if (!in_dev)
+ break;
+ ifa = inet_alloc_ifa(in_dev);
if (!ifa)
break;
- INIT_HLIST_NODE(&ifa->hash);
+
if (colon)
memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ);
else
@@ -1581,16 +1584,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (!inetdev_valid_mtu(dev->mtu))
break;
if (dev->flags & IFF_LOOPBACK) {
- struct in_ifaddr *ifa = inet_alloc_ifa();
+ struct in_ifaddr *ifa = inet_alloc_ifa(in_dev);
if (ifa) {
- INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
ifa->ifa_mask = inet_make_mask(8);
- in_dev_hold(in_dev);
- ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
@@ -1683,6 +1683,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa,
struct nlmsghdr *nlh;
unsigned long tstamp;
u32 preferred, valid;
+ u32 flags;
nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
args->flags);
@@ -1692,7 +1693,13 @@ static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa,
ifm = nlmsg_data(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
- ifm->ifa_flags = READ_ONCE(ifa->ifa_flags);
+
+ flags = READ_ONCE(ifa->ifa_flags);
+ /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits.
+ * The 32bit value is given in IFA_FLAGS attribute.
+ */
+ ifm->ifa_flags = (__u8)flags;
+
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
@@ -1701,7 +1708,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa,
goto nla_put_failure;
tstamp = READ_ONCE(ifa->ifa_tstamp);
- if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
+ if (!(flags & IFA_F_PERMANENT)) {
preferred = READ_ONCE(ifa->ifa_preferred_lft);
valid = READ_ONCE(ifa->ifa_valid_lft);
if (preferred != INFINITY_LIFE_TIME) {
@@ -1732,7 +1739,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa,
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
(ifa->ifa_proto &&
nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) ||
- nla_put_u32(skb, IFA_FLAGS, ifm->ifa_flags) ||
+ nla_put_u32(skb, IFA_FLAGS, flags) ||
(ifa->ifa_rt_priority &&
nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp,
@@ -1875,10 +1882,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
goto done;
if (fillargs.ifindex) {
- err = -ENODEV;
dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
- if (!dev)
+ if (!dev) {
+ err = -ENODEV;
goto done;
+ }
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto done;
@@ -1890,7 +1898,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
cb->seq = inet_base_seq(tgt_net);
- for_each_netdev_dump(net, dev, ctx->ifindex) {
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
continue;
@@ -1935,8 +1943,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}
static size_t inet_get_link_af_size(const struct net_device *dev,
@@ -2132,8 +2139,7 @@ void inet_netconf_notify_devconf(struct net *net, int event, int type,
rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
}
static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
@@ -2377,7 +2383,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
}
}
-static int devinet_conf_proc(struct ctl_table *ctl, int write,
+static int devinet_conf_proc(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int old_value = *(int *)ctl->data;
@@ -2429,7 +2435,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
return ret;
}
-static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
+static int devinet_sysctl_forward(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
@@ -2476,7 +2482,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
return ret;
}
-static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
+static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
@@ -2515,7 +2521,7 @@ static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
+ struct ctl_table devinet_vars[IPV4_DEVCONF_MAX];
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -2578,7 +2584,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
if (!t)
goto out;
- for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) {
t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
t->devinet_vars[i].extra1 = p;
t->devinet_vars[i].extra2 = net;
@@ -2652,7 +2658,6 @@ static struct ctl_table ctl_forward_entry[] = {
.extra1 = &ipv4_devconf,
.extra2 = &init_net,
},
- { },
};
#endif
@@ -2749,7 +2754,7 @@ err_alloc_all:
static __net_exit void devinet_exit_net(struct net *net)
{
#ifdef CONFIG_SYSCTL
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = net->ipv4.forw_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.forw_hdr);
@@ -2793,7 +2798,7 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0);
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr,
- RTNL_FLAG_DUMP_UNLOCKED);
+ RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE);
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
inet_netconf_dump_devconf,
RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d33d12421814..f3281312eb5e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -20,6 +20,7 @@
#include <net/udp.h>
#include <net/tcp.h>
#include <net/espintcp.h>
+#include <linux/skbuff_ref.h>
#include <linux/highmem.h>
@@ -114,7 +115,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
*/
if (req->src != req->dst)
for (sg = sg_next(req->src); sg; sg = sg_next(sg))
- skb_page_unref(skb, sg_page(sg), false);
+ skb_page_unref(page_to_netmem(sg_page(sg)),
+ skb->pp_recycle);
}
#ifdef CONFIG_INET_ESPINTCP
@@ -238,8 +240,7 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
#else
static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
{
- kfree_skb(skb);
-
+ WARN_ON(1);
return -EOPNOTSUPP;
}
#endif
@@ -347,8 +348,8 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
__be16 dport)
{
struct udphdr *uh;
- __be32 *udpdata32;
unsigned int len;
+ struct xfrm_offload *xo = xfrm_offload(skb);
len = skb->len + esp->tailen - skb_transport_offset(skb);
if (len + sizeof(struct iphdr) > IP_MAX_MTU)
@@ -360,13 +361,12 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
uh->len = htons(len);
uh->check = 0;
- *skb_mac_header(skb) = IPPROTO_UDP;
-
- if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) {
- udpdata32 = (__be32 *)(uh + 1);
- udpdata32[0] = udpdata32[1] = 0;
- return (struct ip_esp_hdr *)(udpdata32 + 2);
- }
+ /* For IPv4 ESP with UDP encapsulation, if xo is not null, the skb is in the crypto offload
+ * data path, which means that esp_output_udp_encap is called outside of the XFRM stack.
+ * In this case, the mac header doesn't point to the IPv4 protocol field, so don't set it.
+ */
+ if (!xo || encap_type != UDP_ENCAP_ESPINUDP)
+ *skb_mac_header(skb) = IPPROTO_UDP;
return (struct ip_esp_hdr *)(uh + 1);
}
@@ -423,7 +423,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
switch (encap_type) {
default:
case UDP_ENCAP_ESPINUDP:
- case UDP_ENCAP_ESPINUDP_NON_IKE:
esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
break;
case TCP_ENCAP_ESPINTCP:
@@ -775,7 +774,6 @@ int esp_input_done2(struct sk_buff *skb, int err)
source = th->source;
break;
case UDP_ENCAP_ESPINUDP:
- case UDP_ENCAP_ESPINUDP_NON_IKE:
source = uh->source;
break;
default:
@@ -1179,9 +1177,6 @@ static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
case UDP_ENCAP_ESPINUDP:
x->props.header_len += sizeof(struct udphdr);
break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
- break;
#ifdef CONFIG_INET_ESPINTCP
case TCP_ENCAP_ESPINTCP:
/* only the length field, TCP encap is done by
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index b3271957ad9a..80c4ea0e12f4 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -56,6 +56,13 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
(xfrm_address_t *)&ip_hdr(skb)->daddr,
spi, IPPROTO_ESP, AF_INET);
+
+ if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) {
+ /* non-offload path will record the error and audit log */
+ xfrm_state_put(x);
+ x = NULL;
+ }
+
if (!x)
goto out_reset;
@@ -264,6 +271,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
struct esp_info esp;
bool hw_offload = true;
__u32 seq;
+ int encap_type = 0;
esp.inplace = true;
@@ -296,8 +304,10 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
esp.esph = ip_esp_hdr(skb);
+ if (x->encap)
+ encap_type = x->encap->encap_type;
- if (!hw_offload || !skb_is_gso(skb)) {
+ if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) {
esp.nfrags = esp_output_head(x, skb, &esp);
if (esp.nfrags < 0)
return esp.nfrags;
@@ -324,6 +334,18 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32));
+ if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) {
+ /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by
+ * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header
+ * points to iphdr->protocol (see xfrm4_tunnel_encap_add()).
+ * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol.
+ * Therefore, the protocol field needs to be corrected.
+ */
+ ip_hdr(skb)->protocol = IPPROTO_UDP;
+
+ esph->seq_no = htonl(seq);
+ }
+
ip_hdr(skb)->tot_len = htons(skb->len);
ip_send_check(ip_hdr(skb));
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 48741352a88a..793e6781399a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -293,7 +293,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
- .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
+ .flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK,
.flowi4_scope = scope,
.flowi4_mark = vmark ? skb->mark : 0,
};
@@ -1343,7 +1343,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
struct flowi4 fl4 = {
.flowi4_mark = frn->fl_mark,
.daddr = frn->fl_addr,
- .flowi4_tos = frn->fl_tos,
+ .flowi4_tos = frn->fl_tos & INET_DSCP_MASK,
.flowi4_scope = frn->fl_scope,
};
struct fib_table *tb;
@@ -1660,5 +1660,5 @@ void __init ip_fib_init(void)
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib,
- RTNL_FLAG_DUMP_UNLOCKED);
+ RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE);
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 5bdd1c016009..b07292d50ee7 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -37,6 +37,7 @@ struct fib4_rule {
u8 dst_len;
u8 src_len;
dscp_t dscp;
+ u8 dscp_full:1; /* DSCP or TOS selector */
__be32 src;
__be32 srcmask;
__be32 dst;
@@ -186,7 +187,15 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
+ /* When DSCP selector is used we need to match on the entire DSCP field
+ * in the flow information structure. When TOS selector is used we need
+ * to mask the upper three DSCP bits prior to matching to maintain
+ * legacy behavior.
+ */
+ if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos))
+ return 0;
+ else if (!r->dscp_full && r->dscp &&
+ !fib_dscp_masked_match(r->dscp, fl4))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
@@ -217,6 +226,20 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
+static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4,
+ struct netlink_ext_ack *extack)
+{
+ if (rule4->dscp) {
+ NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
+ return -EINVAL;
+ }
+
+ rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ rule4->dscp_full = true;
+
+ return 0;
+}
+
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
@@ -238,6 +261,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
}
rule4->dscp = inet_dsfield_to_dscp(frh->tos);
+ if (tb[FRA_DSCP] &&
+ fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0)
+ goto errout;
+
/* split local/main if they are not already split */
err = fib_unmerge(net);
if (err)
@@ -320,9 +347,19 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule4->dst_len != frh->dst_len))
return 0;
- if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos)
+ if (frh->tos &&
+ (rule4->dscp_full ||
+ inet_dscp_to_dsfield(rule4->dscp) != frh->tos))
return 0;
+ if (tb[FRA_DSCP]) {
+ dscp_t dscp;
+
+ dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
+ if (!rule4->dscp_full || rule4->dscp != dscp)
+ return 0;
+ }
+
#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
@@ -344,7 +381,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
- frh->tos = inet_dscp_to_dsfield(rule4->dscp);
+
+ if (rule4->dscp_full) {
+ frh->tos = 0;
+ if (nla_put_u8(skb, FRA_DSCP,
+ inet_dscp_to_dsfield(rule4->dscp) >> 2))
+ goto nla_put_failure;
+ } else {
+ frh->tos = inet_dscp_to_dsfield(rule4->dscp);
+ }
if ((rule4->dst_len &&
nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
@@ -366,7 +411,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(4) /* dst */
+ nla_total_size(4) /* src */
- + nla_total_size(4); /* flow */
+ + nla_total_size(4) /* flow */
+ + nla_total_size(1); /* dscp */
}
static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5eb1b8d302bb..ba2df3d2ac15 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -254,7 +254,7 @@ void free_fib_info(struct fib_info *fi)
return;
}
- call_rcu(&fi->rcu, free_fib_info_rcu);
+ call_rcu_hurry(&fi->rcu, free_fib_info_rcu);
}
EXPORT_SYMBOL_GPL(free_fib_info);
@@ -543,8 +543,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
info->nlh, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
static int fib_detect_death(struct fib_info *fi, int order,
@@ -1030,7 +1029,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
bool ecn_ca = false;
nla_strscpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
} else {
if (nla_len(nla) != sizeof(u32))
return false;
@@ -1459,8 +1458,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
if (!fi)
goto failure;
- fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
- cfg->fc_mx_len, extack);
+ fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
if (IS_ERR(fi->fib_metrics)) {
err = PTR_ERR(fi->fib_metrics);
kfree(fi);
@@ -2067,8 +2065,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
if (fa->fa_slen != slen)
continue;
- if (fa->fa_dscp &&
- fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos))
+ if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
continue;
if (fa->tb_id != tb->tb_id)
continue;
@@ -2270,6 +2267,15 @@ void fib_select_path(struct net *net, struct fib_result *res,
fib_select_default(fl4, res);
check_saddr:
- if (!fl4->saddr)
- fl4->saddr = fib_result_prefsrc(net, res);
+ if (!fl4->saddr) {
+ struct net_device *l3mdev;
+
+ l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);
+
+ if (!l3mdev ||
+ l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
+ fl4->saddr = fib_result_prefsrc(net, res);
+ else
+ fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
+ }
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f474106464d2..09e31757e96c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1580,8 +1580,7 @@ found:
if (index >= (1ul << fa->fa_slen))
continue;
}
- if (fa->fa_dscp &&
- inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos)
+ if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
continue;
/* Paired with WRITE_ONCE() in fib_release_info() */
if (READ_ONCE(fi->fib_dead))
@@ -1629,6 +1628,7 @@ set_result:
res->nhc = nhc;
res->type = fa->fa_type;
res->scope = fi->fib_scope;
+ res->dscp = fa->fa_dscp;
res->fi = fi;
res->table = tb;
res->fa_head = &n->leaf;
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
index 06e5572f296f..54984f3170a8 100644
--- a/net/ipv4/fou_bpf.c
+++ b/net/ipv4/fou_bpf.c
@@ -64,7 +64,7 @@ __bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
info->encap.type = TUNNEL_ENCAP_NONE;
}
- if (info->key.tun_flags & TUNNEL_CSUM)
+ if (test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
info->encap.sport = encap->sport;
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
index a8494f796dca..3e30745e2c09 100644
--- a/net/ipv4/fou_core.c
+++ b/net/ipv4/fou_core.c
@@ -50,7 +50,7 @@ struct fou_net {
static inline struct fou *fou_from_sock(struct sock *sk)
{
- return sk->sk_user_data;
+ return rcu_dereference_sk_user_data(sk);
}
static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
@@ -233,9 +233,15 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
struct sk_buff *skb)
{
const struct net_offload __rcu **offloads;
- u8 proto = fou_from_sock(sk)->protocol;
+ struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
struct sk_buff *pp = NULL;
+ u8 proto;
+
+ if (!fou)
+ goto out;
+
+ proto = fou->protocol;
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
@@ -263,14 +269,24 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
int nhoff)
{
const struct net_offload __rcu **offloads;
- u8 proto = fou_from_sock(sk)->protocol;
+ struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
- int err = -ENOSYS;
+ u8 proto;
+ int err;
+
+ if (!fou) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ proto = fou->protocol;
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
- if (WARN_ON(!ops || !ops->callbacks.gro_complete))
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete)) {
+ err = -ENOSYS;
goto out;
+ }
err = ops->callbacks.gro_complete(skb, nhoff);
@@ -322,6 +338,9 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
skb_gro_remcsum_init(&grc);
+ if (!fou)
+ goto out;
+
off = skb_gro_offset(skb);
len = off + sizeof(*guehdr);
@@ -433,7 +452,7 @@ next_proto:
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
- if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
+ if (!ops || !ops->callbacks.gro_receive)
goto out;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 3757fd93523f..6701a98d9a9f 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -73,7 +73,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ gre_flags_to_tnl_flags(tpi->flags, greh->flags);
hdr_len = gre_calc_hlen(tpi->flags);
if (!pskb_may_pull(skb, nhs + hdr_len))
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index e63a3bf99617..e1384e7331d8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -92,6 +92,10 @@
#include <net/inet_common.h>
#include <net/ip_fib.h>
#include <net/l3mdev.h>
+#include <net/addrconf.h>
+#include <net/inet_dscp.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/icmp.h>
/*
* Build xmit assembly blocks
@@ -217,61 +221,56 @@ static inline void icmp_xmit_unlock(struct sock *sk)
spin_unlock(&sk->sk_lock.slock);
}
-int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
-int sysctl_icmp_msgs_burst __read_mostly = 50;
-
-static struct {
- spinlock_t lock;
- u32 credit;
- u32 stamp;
-} icmp_global = {
- .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
-};
-
/**
* icmp_global_allow - Are we allowed to send one more ICMP message ?
+ * @net: network namespace
*
* Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
* Returns false if we reached the limit and can not send another packet.
- * Note: called with BH disabled
+ * Works in tandem with icmp_global_consume().
*/
-bool icmp_global_allow(void)
+bool icmp_global_allow(struct net *net)
{
- u32 credit, delta, incr = 0, now = (u32)jiffies;
- bool rc = false;
+ u32 delta, now, oldstamp;
+ int incr, new, old;
- /* Check if token bucket is empty and cannot be refilled
- * without taking the spinlock. The READ_ONCE() are paired
- * with the following WRITE_ONCE() in this same function.
+ /* Note: many cpus could find this condition true.
+ * Then later icmp_global_consume() could consume more credits,
+ * this is an acceptable race.
*/
- if (!READ_ONCE(icmp_global.credit)) {
- delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
- if (delta < HZ / 50)
- return false;
- }
+ if (atomic_read(&net->ipv4.icmp_global_credit) > 0)
+ return true;
- spin_lock(&icmp_global.lock);
- delta = min_t(u32, now - icmp_global.stamp, HZ);
- if (delta >= HZ / 50) {
- incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
- if (incr)
- WRITE_ONCE(icmp_global.stamp, now);
- }
- credit = min_t(u32, icmp_global.credit + incr,
- READ_ONCE(sysctl_icmp_msgs_burst));
- if (credit) {
- /* We want to use a credit of one in average, but need to randomize
- * it for security reasons.
- */
- credit = max_t(int, credit - get_random_u32_below(3), 0);
- rc = true;
+ now = jiffies;
+ oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp);
+ delta = min_t(u32, now - oldstamp, HZ);
+ if (delta < HZ / 50)
+ return false;
+
+ incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ;
+ if (!incr)
+ return false;
+
+ if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) {
+ old = atomic_read(&net->ipv4.icmp_global_credit);
+ do {
+ new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst));
+ } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new));
}
- WRITE_ONCE(icmp_global.credit, credit);
- spin_unlock(&icmp_global.lock);
- return rc;
+ return true;
}
EXPORT_SYMBOL(icmp_global_allow);
+void icmp_global_consume(struct net *net)
+{
+ int credits = get_random_u32_below(3);
+
+ /* Note: this might make icmp_global.credit negative. */
+ if (credits)
+ atomic_sub(credits, &net->ipv4.icmp_global_credit);
+}
+EXPORT_SYMBOL(icmp_global_consume);
+
static bool icmpv4_mask_allow(struct net *net, int type, int code)
{
if (type > NR_ICMP_TYPES)
@@ -288,14 +287,16 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code)
return false;
}
-static bool icmpv4_global_allow(struct net *net, int type, int code)
+static bool icmpv4_global_allow(struct net *net, int type, int code,
+ bool *apply_ratelimit)
{
if (icmpv4_mask_allow(net, type, code))
return true;
- if (icmp_global_allow())
+ if (icmp_global_allow(net)) {
+ *apply_ratelimit = true;
return true;
-
+ }
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
return false;
}
@@ -305,15 +306,16 @@ static bool icmpv4_global_allow(struct net *net, int type, int code)
*/
static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
- struct flowi4 *fl4, int type, int code)
+ struct flowi4 *fl4, int type, int code,
+ bool apply_ratelimit)
{
struct dst_entry *dst = &rt->dst;
struct inet_peer *peer;
bool rc = true;
int vif;
- if (icmpv4_mask_allow(net, type, code))
- goto out;
+ if (!apply_ratelimit)
+ return true;
/* No rate limit on loopback */
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
@@ -328,6 +330,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
out:
if (!rc)
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+ else
+ icmp_global_consume(net);
return rc;
}
@@ -399,6 +403,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
struct net *net = dev_net(rt->dst.dev);
+ bool apply_ratelimit = false;
struct flowi4 fl4;
struct sock *sk;
struct inet_sock *inet;
@@ -410,11 +415,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
return;
- /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
local_bh_disable();
- /* global icmp_msgs_per_sec */
- if (!icmpv4_global_allow(net, type, code))
+ /* is global icmp_msgs_per_sec exhausted ? */
+ if (!icmpv4_global_allow(net, type, code, &apply_ratelimit))
goto out_bh_enable;
sk = icmp_xmit_lock(net);
@@ -440,14 +445,14 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK;
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
goto out_unlock;
- if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+ if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
@@ -482,6 +487,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
struct icmp_bxm *param)
{
struct net_device *route_lookup_dev;
+ struct dst_entry *dst, *dst2;
struct rtable *rt, *rt2;
struct flowi4 fl4_dec;
int err;
@@ -492,7 +498,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
- fl4->flowi4_tos = RT_TOS(tos);
+ fl4->flowi4_tos = tos & INET_DSCP_MASK;
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
@@ -507,16 +513,17 @@ static struct rtable *icmp_route_lookup(struct net *net,
/* No need to clone since we're just using its address. */
rt2 = rt;
- rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
- flowi4_to_flowi(fl4), NULL, 0);
- if (!IS_ERR(rt)) {
+ dst = xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(fl4), NULL, 0);
+ rt = dst_rtable(dst);
+ if (!IS_ERR(dst)) {
if (rt != rt2)
return rt;
- } else if (PTR_ERR(rt) == -EPERM) {
+ } else if (PTR_ERR(dst) == -EPERM) {
rt = NULL;
- } else
+ } else {
return rt;
-
+ }
err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
if (err)
goto relookup_failed;
@@ -540,7 +547,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
orefdst = skb_in->_skb_refdst; /* save old refdst */
skb_dst_set(skb_in, NULL);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
- RT_TOS(tos), rt2->dst.dev);
+ tos, rt2->dst.dev);
dst_release(&rt2->dst);
rt2 = skb_rtable(skb_in);
@@ -550,19 +557,19 @@ static struct rtable *icmp_route_lookup(struct net *net,
if (err)
goto relookup_failed;
- rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
- flowi4_to_flowi(&fl4_dec), NULL,
- XFRM_LOOKUP_ICMP);
- if (!IS_ERR(rt2)) {
+ dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL,
+ XFRM_LOOKUP_ICMP);
+ rt2 = dst_rtable(dst2);
+ if (!IS_ERR(dst2)) {
dst_release(&rt->dst);
memcpy(fl4, &fl4_dec, sizeof(*fl4));
rt = rt2;
- } else if (PTR_ERR(rt2) == -EPERM) {
+ } else if (PTR_ERR(dst2) == -EPERM) {
if (rt)
dst_release(&rt->dst);
return rt2;
} else {
- err = PTR_ERR(rt2);
+ err = PTR_ERR(dst2);
goto relookup_failed;
}
return rt;
@@ -591,6 +598,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
int room;
struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
+ bool apply_ratelimit = false;
struct ipcm_cookie ipc;
struct flowi4 fl4;
__be32 saddr;
@@ -672,7 +680,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
}
}
- /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
local_bh_disable();
/* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
@@ -680,7 +688,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
* loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
*/
if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
- !icmpv4_global_allow(net, type, code))
+ !icmpv4_global_allow(net, type, code, &apply_ratelimit))
goto out_bh_enable;
sk = icmp_xmit_lock(net);
@@ -739,7 +747,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
goto out_unlock;
/* peer icmp_ratelimit */
- if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
goto ende;
/* RFC says return as much as we can without exceeding 576 bytes. */
@@ -767,6 +775,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (!fl4.saddr)
fl4.saddr = htonl(INADDR_DUMMY);
+ trace_icmp_send(skb_in, type, code);
+
icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
ende:
ip_rt_put(rt);
@@ -1032,6 +1042,8 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
struct icmp_ext_hdr *ext_hdr, _ext_hdr;
struct icmp_ext_echo_iio *iio, _iio;
struct net *net = dev_net(skb->dev);
+ struct inet6_dev *in6_dev;
+ struct in_device *in_dev;
struct net_device *dev;
char buff[IFNAMSIZ];
u16 ident_len;
@@ -1115,10 +1127,15 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
/* Fill bits in reply message */
if (dev->flags & IFF_UP)
status |= ICMP_EXT_ECHOREPLY_ACTIVE;
- if (__in_dev_get_rcu(dev) && __in_dev_get_rcu(dev)->ifa_list)
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev && rcu_access_pointer(in_dev->ifa_list))
status |= ICMP_EXT_ECHOREPLY_IPV4;
- if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list))
+
+ in6_dev = __in6_dev_get(dev);
+ if (in6_dev && !list_empty(&in6_dev->addr_list))
status |= ICMP_EXT_ECHOREPLY_IPV6;
+
dev_put(dev);
icmphdr->un.echo.sequence |= htons(status);
return true;
@@ -1473,6 +1490,8 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
net->ipv4.sysctl_icmp_ratemask = 0x1818;
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+ net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
+ net->ipv4.sysctl_icmp_msgs_burst = 50;
return 0;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 717e97a389a8..9bf09de6a2e7 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1842,7 +1842,8 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
if (!dev) {
struct rtable *rt = ip_route_output(net,
imr->imr_multiaddr.s_addr,
- 0, 0, 0);
+ 0, 0, 0,
+ RT_SCOPE_UNIVERSE);
if (!IS_ERR(rt)) {
dev = rt->dst.dev;
ip_rt_put(rt);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3b38610958ee..2c5632d4fddb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -236,7 +236,7 @@ static bool inet_bhash2_conflict(const struct sock *sk,
#define sk_for_each_bound_bhash(__sk, __tb2, __tb) \
hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \
- sk_for_each_bound(sk2, &(__tb2)->owners)
+ sk_for_each_bound((__sk), &(__tb2)->owners)
/* This should be called only when the tb and tb2 hashbuckets' locks are held */
static int inet_csk_bind_conflict(const struct sock *sk,
@@ -661,7 +661,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
/*
* This will accept the next outstanding connection.
*/
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
+struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
@@ -680,7 +680,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
/* Find already established connection */
if (reqsk_queue_empty(queue)) {
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
@@ -692,6 +692,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
goto out_err;
}
req = reqsk_queue_remove(queue, sk);
+ arg->is_empty = reqsk_queue_empty(queue);
newsk = req->sk;
if (sk->sk_protocol == IPPROTO_TCP &&
@@ -713,6 +714,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
out:
release_sock(sk);
if (newsk && mem_cgroup_sockets_enabled) {
+ gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
int amt = 0;
/* atomically get the memory usage, set and charge the
@@ -730,8 +732,8 @@ out:
}
if (amt)
- mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
- GFP_KERNEL | __GFP_NOFAIL);
+ mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp);
+ kmem_cache_charge(newsk, gfp);
release_sock(newsk);
}
@@ -745,7 +747,7 @@ out:
out_err:
newsk = NULL;
req = NULL;
- *err = error;
+ arg->err = error;
goto out;
}
EXPORT_SYMBOL(inet_csk_accept);
@@ -910,6 +912,64 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
+static struct request_sock *
+reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
+ bool attach_listener)
+{
+ struct request_sock *req;
+
+ req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+ if (!req)
+ return NULL;
+ req->rsk_listener = NULL;
+ if (attach_listener) {
+ if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) {
+ kmem_cache_free(ops->slab, req);
+ return NULL;
+ }
+ req->rsk_listener = sk_listener;
+ }
+ req->rsk_ops = ops;
+ req_to_sk(req)->sk_prot = sk_listener->sk_prot;
+ sk_node_init(&req_to_sk(req)->sk_node);
+ sk_tx_queue_clear(req_to_sk(req));
+ req->saved_syn = NULL;
+ req->syncookie = 0;
+ req->timeout = 0;
+ req->num_timeout = 0;
+ req->num_retrans = 0;
+ req->sk = NULL;
+ refcount_set(&req->rsk_refcnt, 0);
+
+ return req;
+}
+#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__))
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener,
+ bool attach_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
+
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ ireq->ireq_opt = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ ireq->pktopts = NULL;
+#endif
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+ ireq->ireq_family = sk_listener->sk_family;
+ req->timeout = TCP_TIMEOUT_INIT;
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
struct sock *sk)
{
@@ -1121,25 +1181,34 @@ drop:
inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq);
}
-static void reqsk_queue_hash_req(struct request_sock *req,
+static bool reqsk_queue_hash_req(struct request_sock *req,
unsigned long timeout)
{
+ bool found_dup_sk = false;
+
+ if (!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk))
+ return false;
+
+ /* The timer needs to be setup after a successful insertion. */
timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
mod_timer(&req->rsk_timer, jiffies + timeout);
- inet_ehash_insert(req_to_sk(req), NULL, NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
refcount_set(&req->rsk_refcnt, 2 + 1);
+ return true;
}
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
- reqsk_queue_hash_req(req, timeout);
+ if (!reqsk_queue_hash_req(req, timeout))
+ return false;
+
inet_csk_reqsk_queue_added(sk);
+ return true;
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7adace541fe2..67639309163d 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -442,7 +442,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
inet_diag_msg_common_fill(r, sk);
r->idiag_retrans = 0;
- r->idiag_state = tw->tw_substate;
+ r->idiag_state = READ_ONCE(tw->tw_substate);
r->idiag_timer = 3;
tmo = tw->tw_timer.expires - jiffies;
r->idiag_expires = jiffies_delta_to_msecs(tmo);
@@ -1209,7 +1209,7 @@ next_chunk:
if (num < s_num)
goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_substate : sk->sk_state;
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
if (!(idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
@@ -1383,6 +1383,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
req.idiag_ext = rc->idiag_ext;
+ req.pad = 0;
req.idiag_states = rc->idiag_states;
req.id = rc->id;
@@ -1398,6 +1399,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
req.sdiag_family = rc->idiag_family;
req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
req.idiag_ext = rc->idiag_ext;
+ req.pad = 0;
req.idiag_states = rc->idiag_states;
req.id = rc->id;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c88c9034d630..d179a2c84222 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -175,7 +175,7 @@ static void fqdir_free_fn(struct work_struct *work)
}
}
-static DECLARE_WORK(fqdir_free_work, fqdir_free_fn);
+static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn);
static void fqdir_work_fn(struct work_struct *work)
{
@@ -184,7 +184,7 @@ static void fqdir_work_fn(struct work_struct *work)
rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
if (llist_add(&fqdir->free_list, &fqdir_free_list))
- queue_work(system_wq, &fqdir_free_work);
+ queue_delayed_work(system_wq, &fqdir_free_work, HZ);
}
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
@@ -619,7 +619,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
skb_mark_not_on_list(head);
head->prev = NULL;
head->tstamp = q->stamp;
- head->mono_delivery_time = q->mono_delivery_time;
+ head->tstamp_type = q->tstamp_type;
if (sk)
refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index cf88eca5f1b4..9bfcfd016e18 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -310,7 +310,7 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
return inet_lhash2_bucket(h, hash);
}
-static inline int compute_score(struct sock *sk, struct net *net,
+static inline int compute_score(struct sock *sk, const struct net *net,
const unsigned short hnum, const __be32 daddr,
const int dif, const int sdif)
{
@@ -348,7 +348,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
* Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
* the selected sock or an error.
*/
-struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
+struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
struct sk_buff *skb, int doff,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned short hnum,
@@ -374,7 +374,7 @@ EXPORT_SYMBOL_GPL(inet_lookup_reuseport);
*/
/* called with rcu_read_lock() : No refcount taken on the socket */
-static struct sock *inet_lhash2_lookup(struct net *net,
+static struct sock *inet_lhash2_lookup(const struct net *net,
struct inet_listen_hashbucket *ilb2,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
@@ -401,7 +401,7 @@ static struct sock *inet_lhash2_lookup(struct net *net,
return result;
}
-struct sock *inet_lookup_run_sk_lookup(struct net *net,
+struct sock *inet_lookup_run_sk_lookup(const struct net *net,
int protocol,
struct sk_buff *skb, int doff,
__be32 saddr, __be16 sport,
@@ -423,7 +423,7 @@ struct sock *inet_lookup_run_sk_lookup(struct net *net,
return sk;
}
-struct sock *__inet_lookup_listener(struct net *net,
+struct sock *__inet_lookup_listener(const struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
@@ -488,7 +488,7 @@ void sock_edemux(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_edemux);
-struct sock *__inet_lookup_established(struct net *net,
+struct sock *__inet_lookup_established(const struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
@@ -565,7 +565,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (twsk_unique(sk, sk2, twp))
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index e8de45d34d56..337390ba85b4 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -92,13 +92,22 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
hlist_nulls_add_head_rcu(&tw->tw_node, list);
}
+static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
+{
+ __inet_twsk_schedule(tw, timeo, false);
+}
+
/*
- * Enter the time wait state. This is called with locally disabled BH.
+ * Enter the time wait state.
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
+ *
+ * The caller must not access @tw anymore after this function returns.
*/
-void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
- struct inet_hashinfo *hashinfo)
+void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
+ struct sock *sk,
+ struct inet_hashinfo *hashinfo,
+ int timeo)
{
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -114,6 +123,7 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
hashinfo->bhash_size)];
bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
+ local_bh_disable();
spin_lock(&bhead->lock);
spin_lock(&bhead2->lock);
@@ -129,26 +139,34 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
+ /* Step 2: Hash TW into tcp ehash chain */
inet_twsk_add_node_rcu(tw, &ehead->chain);
/* Step 3: Remove SK from hash chain */
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- spin_unlock(lock);
+ /* Ensure above writes are committed into memory before updating the
+ * refcount.
+ * Provides ordering vs later refcount_inc().
+ */
+ smp_wmb();
/* tw_refcnt is set to 3 because we have :
* - one reference for bhash chain.
* - one reference for ehash chain.
* - one reference for timer.
- * We can use atomic_set() because prior spin_lock()/spin_unlock()
- * committed into memory all tw fields.
* Also note that after this point, we lost our implicit reference
* so we are not allowed to use tw anymore.
*/
refcount_set(&tw->tw_refcnt, 3);
+
+ inet_twsk_schedule(tw, timeo);
+
+ spin_unlock(lock);
+ local_bh_enable();
}
-EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
+EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule);
static void tw_timer_handler(struct timer_list *t)
{
@@ -192,7 +210,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
- timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
+ timer_setup(&tw->tw_timer, tw_timer_handler, 0);
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
@@ -217,7 +235,34 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
*/
void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
- if (del_timer_sync(&tw->tw_timer))
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+
+ /* inet_twsk_purge() walks over all sockets, including tw ones,
+ * and removes them via inet_twsk_deschedule_put() after a
+ * refcount_inc_not_zero().
+ *
+ * inet_twsk_hashdance_schedule() must (re)init the refcount before
+ * arming the timer, i.e. inet_twsk_purge can obtain a reference to
+ * a twsk that did not yet schedule the timer.
+ *
+ * The ehash lock synchronizes these two:
+ * After acquiring the lock, the timer is always scheduled (else
+ * timer_shutdown returns false), because hashdance_schedule releases
+ * the ehash lock only after completing the timer initialization.
+ *
+ * Without grabbing the ehash lock, we get:
+ * 1) cpu x sets twsk refcount to 3
+ * 2) cpu y bumps refcount to 4
+ * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down
+ * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown
+ * -> timer refcount is never decremented.
+ */
+ spin_lock(lock);
+ /* Makes sure hashdance_schedule() has completed */
+ spin_unlock(lock);
+
+ if (timer_shutdown_sync(&tw->tw_timer))
inet_twsk_kill(tw);
inet_twsk_put(tw);
}
@@ -264,14 +309,18 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
+void inet_twsk_purge(struct inet_hashinfo *hashinfo)
{
+ struct inet_ehash_bucket *head = &hashinfo->ehash[0];
+ unsigned int ehash_mask = hashinfo->ehash_mask;
struct hlist_nulls_node *node;
unsigned int slot;
struct sock *sk;
- for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+ for (slot = 0; slot <= ehash_mask; slot++, head++) {
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
restart_rcu:
cond_resched();
rcu_read_lock();
@@ -283,15 +332,13 @@ restart:
TCPF_NEW_SYN_RECV))
continue;
- if (sk->sk_family != family ||
- refcount_read(&sock_net(sk)->ns.count))
+ if (refcount_read(&sock_net(sk)->ns.count))
continue;
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
continue;
- if (unlikely(sk->sk_family != family ||
- refcount_read(&sock_net(sk)->ns.count))) {
+ if (refcount_read(&sock_net(sk)->ns.count)) {
sock_gen_put(sk);
goto restart;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fb947d1613fe..a92664a5ef2e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -355,7 +355,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->iif = dev->ifindex;
qp->q.stamp = skb->tstamp;
- qp->q.mono_delivery_time = skb->mono_delivery_time;
+ qp->q.tstamp_type = skb->tstamp_type;
qp->q.meat += skb->len;
qp->ecn |= ecn;
add_frag_mem_limit(qp->q.fqdir, skb->truesize);
@@ -580,7 +580,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &dist_min,
},
- { }
};
/* secret interval has been deprecated */
@@ -593,7 +592,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
@@ -632,7 +630,7 @@ err_alloc:
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->ipv4.frags_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.frags_hdr);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 57ddcd8c62f6..5f6fd382af38 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,6 +44,7 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>
+#include <net/inet_dscp.h>
/*
Problems & solutions
@@ -265,6 +266,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
struct erspan_base_hdr *ershdr;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
struct ip_tunnel_net *itn;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
@@ -272,12 +274,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
int ver;
int len;
+ ip_tunnel_flags_copy(flags, tpi->flags);
+
itn = net_generic(net, erspan_net_id);
iph = ip_hdr(skb);
if (is_erspan_type1(gre_hdr_len)) {
ver = 0;
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
- tpi->flags | TUNNEL_NO_KEY,
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
iph->saddr, iph->daddr, 0);
} else {
if (unlikely(!pskb_may_pull(skb,
@@ -287,8 +291,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
ver = ershdr->ver;
iph = ip_hdr(skb);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
- tpi->flags | TUNNEL_KEY,
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
iph->saddr, iph->daddr, tpi->key);
}
@@ -312,10 +316,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
struct ip_tunnel_info *info;
unsigned char *gh;
__be64 tun_id;
- __be16 flags;
- tpi->flags |= TUNNEL_KEY;
- flags = tpi->flags;
+ __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags);
+ ip_tunnel_flags_copy(flags, tpi->flags);
tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags,
@@ -338,7 +341,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
ERSPAN_V2_MDSIZE);
info = &tun_dst->u.tun_info;
- info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+ info->key.tun_flags);
info->options_len = sizeof(*md);
}
@@ -381,10 +385,13 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
tnl_params = &tunnel->parms.iph;
if (tunnel->collect_md || tnl_params->daddr == 0) {
- __be16 flags;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
__be64 tun_id;
- flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ ip_tunnel_flags_and(flags, tpi->flags, flags);
+
tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
@@ -464,12 +471,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- __be16 flags = tunnel->parms.o_flags;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+
+ ip_tunnel_flags_copy(flags, tunnel->parms.o_flags);
/* Push GRE header. */
gre_build_header(skb, tunnel->tun_hlen,
flags, proto, tunnel->parms.o_key,
- (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
@@ -483,10 +493,10 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
int tunnel_hlen;
- __be16 flags;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -500,14 +510,19 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
goto err_free_skb;
/* Push Tunnel header. */
- if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
goto err_free_skb;
- flags = tun_info->key.tun_flags &
- (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags);
+
gre_build_header(skb, tunnel_hlen, flags, proto,
tunnel_id_to_key32(tun_info->key.tun_id),
- (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
@@ -521,6 +536,7 @@ err_free_skb:
static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
struct erspan_metadata *md;
@@ -536,7 +552,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
goto err_free_skb;
key = &tun_info->key;
- if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+ if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
goto err_free_skb;
if (tun_info->options_len < sizeof(*md))
goto err_free_skb;
@@ -589,8 +605,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
goto err_free_skb;
}
- gre_build_header(skb, 8, TUNNEL_SEQ,
- proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ gre_build_header(skb, 8, flags, proto, 0,
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)));
ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
@@ -664,7 +681,8 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
tnl_params = &tunnel->parms.iph;
}
- if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
goto free_skb;
__gre_xmit(skb, dev, tnl_params, skb->protocol);
@@ -706,7 +724,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
/* Push ERSPAN header */
if (tunnel->erspan_ver == 0) {
proto = htons(ETH_P_ERSPAN);
- tunnel->parms.o_flags &= ~TUNNEL_SEQ;
+ __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags);
} else if (tunnel->erspan_ver == 1) {
erspan_build_header(skb, ntohl(tunnel->parms.o_key),
tunnel->index,
@@ -721,7 +739,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
goto free_skb;
}
- tunnel->parms.o_flags &= ~TUNNEL_KEY;
+ __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags);
__gre_xmit(skb, dev, &tunnel->parms.iph, proto);
return NETDEV_TX_OK;
@@ -744,7 +762,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
}
- if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
goto free_skb;
if (skb_cow_head(skb, dev->needed_headroom))
@@ -762,7 +781,6 @@ free_skb:
static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- __be16 flags;
int len;
len = tunnel->tun_hlen;
@@ -776,12 +794,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
dev->needed_headroom += len;
if (set_mtu)
- dev->mtu = max_t(int, dev->mtu - len, 68);
+ WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68));
- flags = tunnel->parms.o_flags;
-
- if (flags & TUNNEL_SEQ ||
- (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) ||
+ (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+ tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
dev->features &= ~NETIF_F_GSO_SOFTWARE;
dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
} else {
@@ -790,20 +807,29 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
}
}
-static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
+static int ipgre_tunnel_ctl(struct net_device *dev,
+ struct ip_tunnel_parm_kern *p,
int cmd)
{
+ __be16 i_flags, o_flags;
int err;
+ if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(p->o_flags))
+ return -EOVERFLOW;
+
+ i_flags = ip_tunnel_flags_to_be16(p->i_flags);
+ o_flags = ip_tunnel_flags_to_be16(p->o_flags);
+
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
- ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))
+ ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
}
- p->i_flags = gre_flags_to_tnl_flags(p->i_flags);
- p->o_flags = gre_flags_to_tnl_flags(p->o_flags);
+ gre_flags_to_tnl_flags(p->i_flags, i_flags);
+ gre_flags_to_tnl_flags(p->o_flags, o_flags);
err = ip_tunnel_ctl(dev, p, cmd);
if (err)
@@ -812,15 +838,18 @@ static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
if (cmd == SIOCCHGTUNNEL) {
struct ip_tunnel *t = netdev_priv(dev);
- t->parms.i_flags = p->i_flags;
- t->parms.o_flags = p->o_flags;
+ ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags);
if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
ipgre_link_update(dev, true);
}
- p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
- p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+ i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
+ ip_tunnel_flags_from_be16(p->i_flags, i_flags);
+ o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+ ip_tunnel_flags_from_be16(p->o_flags, o_flags);
+
return 0;
}
@@ -902,7 +931,7 @@ static int ipgre_open(struct net_device *dev)
t->parms.iph.daddr,
t->parms.iph.saddr,
t->parms.o_key,
- RT_TOS(t->parms.iph.tos),
+ t->parms.iph.tos & INET_DSCP_MASK,
t->parms.link);
if (IS_ERR(rt))
return -EADDRNOTAVAIL;
@@ -960,7 +989,6 @@ static void ipgre_tunnel_setup(struct net_device *dev)
static void __gre_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel;
- __be16 flags;
tunnel = netdev_priv(dev);
tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -969,21 +997,22 @@ static void __gre_tunnel_init(struct net_device *dev)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
- dev->features |= GRE_FEATURES | NETIF_F_LLTX;
+ dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
- flags = tunnel->parms.o_flags;
-
/* TCP offload with GRE SEQ is not supported, nor can we support 2
* levels of outer headers requiring an update.
*/
- if (flags & TUNNEL_SEQ)
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags))
return;
- if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+ tunnel->encap.type != TUNNEL_ENCAP_NONE)
return;
dev->features |= NETIF_F_GSO_SOFTWARE;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+
+ dev->lltx = true;
}
static int ipgre_tunnel_init(struct net_device *dev)
@@ -1136,7 +1165,7 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
static int ipgre_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
__u32 *fwmark)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1152,10 +1181,12 @@ static int ipgre_netlink_parms(struct net_device *dev,
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
if (data[IFLA_GRE_IFLAGS])
- parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
+ gre_flags_to_tnl_flags(parms->i_flags,
+ nla_get_be16(data[IFLA_GRE_IFLAGS]));
if (data[IFLA_GRE_OFLAGS])
- parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
+ gre_flags_to_tnl_flags(parms->o_flags,
+ nla_get_be16(data[IFLA_GRE_OFLAGS]));
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1203,7 +1234,7 @@ static int ipgre_netlink_parms(struct net_device *dev,
static int erspan_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
__u32 *fwmark)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1362,7 +1393,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_parm p;
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = 0;
int err;
@@ -1380,7 +1411,7 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_parm p;
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = 0;
int err;
@@ -1399,8 +1430,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = t->fwmark;
- struct ip_tunnel_parm p;
int err;
err = ipgre_newlink_encap_setup(dev, data);
@@ -1415,8 +1446,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
if (err < 0)
return err;
- t->parms.i_flags = p.i_flags;
- t->parms.o_flags = p.o_flags;
+ ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
ipgre_link_update(dev, !tb[IFLA_MTU]);
@@ -1428,8 +1459,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = t->fwmark;
- struct ip_tunnel_parm p;
int err;
err = ipgre_newlink_encap_setup(dev, data);
@@ -1444,8 +1475,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
if (err < 0)
return err;
- t->parms.i_flags = p.i_flags;
- t->parms.o_flags = p.o_flags;
+ ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
return 0;
}
@@ -1501,8 +1532,10 @@ static size_t ipgre_get_size(const struct net_device *dev)
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm *p = &t->parms;
- __be16 o_flags = p->o_flags;
+ struct ip_tunnel_parm_kern *p = &t->parms;
+ IP_TUNNEL_DECLARE_FLAGS(o_flags);
+
+ ip_tunnel_flags_copy(o_flags, p->o_flags);
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
nla_put_be16(skb, IFLA_GRE_IFLAGS,
@@ -1550,7 +1583,7 @@ static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
if (t->erspan_ver <= 2) {
if (t->erspan_ver != 0 && !t->collect_md)
- t->parms.o_flags |= TUNNEL_KEY;
+ __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags);
if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
goto nla_put_failure;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 5e9c8156656a..b6e7d4921309 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -596,9 +596,8 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
{
struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *dev = skb->dev;
struct dst_entry *dst;
@@ -616,7 +615,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
dst = skb_dst(skb);
if (curr_dst != dst) {
hint = ip_extract_route_hint(net, skb,
- ((struct rtable *)dst)->rt_type);
+ dst_rtable(dst)->rt_type);
/* dispatch old sublist */
if (!list_empty(&sublist))
@@ -646,9 +645,8 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
struct net_device *curr_dev = NULL;
struct net *curr_net = NULL;
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1fe794967211..49811c9281d4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -77,6 +77,7 @@
#include <net/inetpeer.h>
#include <net/inet_ecn.h>
#include <net/lwtunnel.h>
+#include <net/inet_dscp.h>
#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
@@ -198,7 +199,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct rtable *rt = (struct rtable *)dst;
+ struct rtable *rt = dst_rtable(dst);
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
@@ -475,7 +476,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
goto packet_routed;
/* Make sure we can route this packet. */
- rt = (struct rtable *)__sk_dst_check(sk, 0);
+ rt = dst_rtable(__sk_dst_check(sk, 0));
if (!rt) {
__be32 daddr;
@@ -493,7 +494,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
- RT_TOS(tos),
+ tos & INET_DSCP_MASK,
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
@@ -764,7 +765,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
{
struct iphdr *iph;
struct sk_buff *skb2;
- bool mono_delivery_time = skb->mono_delivery_time;
+ u8 tstamp_type = skb->tstamp_type;
struct rtable *rt = skb_rtable(skb);
unsigned int mtu, hlen, ll_rs;
struct ip_fraglist_iter iter;
@@ -856,7 +857,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
}
}
- skb_set_delivery_time(skb, tstamp, mono_delivery_time);
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
err = output(net, sk, skb);
if (!err)
@@ -912,7 +913,7 @@ slow_path:
/*
* Put this fragment into the sending queue.
*/
- skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
+ skb_set_delivery_time(skb2, tstamp, tstamp_type);
err = output(net, sk, skb2);
if (err)
goto fail;
@@ -971,7 +972,7 @@ static int __ip_append_data(struct sock *sk,
bool zc = false;
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
- struct rtable *rt = (struct rtable *)cork->dst;
+ struct rtable *rt = dst_rtable(cork->dst);
bool paged, hold_tskey, extra_uref = false;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
@@ -1390,7 +1391,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options *opt = NULL;
- struct rtable *rt = (struct rtable *)cork->dst;
+ struct rtable *rt = dst_rtable(cork->dst);
struct iphdr *iph;
u8 pmtudisc, ttl;
__be16 df = 0;
@@ -1457,7 +1458,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
skb->mark = cork->mark;
- skb->tstamp = cork->transmit_time;
+ if (sk_is_tcp(sk))
+ skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
+ else
+ skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid);
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount
@@ -1473,7 +1477,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
* by icmp_hdr(skb)->type.
*/
if (sk->sk_type == SOCK_RAW &&
- !inet_test_bit(HDRINCL, sk))
+ !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH))
icmp_type = fl4->fl4_icmp_type;
else
icmp_type = icmp_hdr(skb)->type;
@@ -1618,7 +1622,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
- RT_TOS(arg->tos),
+ arg->tos & INET_DSCP_MASK,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
@@ -1649,7 +1653,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- nskb->mono_delivery_time = !!transmit_time;
+ if (transmit_time)
+ nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
if (txhash)
skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4);
ip_push_pending_frames(sk, &fl4);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 1b8d8ff9a237..d591c73e2c0e 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -43,6 +43,7 @@
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -56,17 +57,13 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
IP_TNL_HASH_BITS);
}
-static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
- __be16 flags, __be32 key)
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
+ const unsigned long *flags, __be32 key)
{
- if (p->i_flags & TUNNEL_KEY) {
- if (flags & TUNNEL_KEY)
- return key == p->i_key;
- else
- /* key expected, none present */
- return false;
- } else
- return !(flags & TUNNEL_KEY);
+ if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
+ return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);
+
+ return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
}
/* Fallback tunnel: no source, no destination, no key, no options
@@ -81,7 +78,7 @@ static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
Given src, dst and key, find appropriate for input tunnel.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
- int link, __be16 flags,
+ int link, const unsigned long *flags,
__be32 remote, __be32 local,
__be32 key)
{
@@ -143,7 +140,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
}
hlist_for_each_entry_rcu(t, head, hash_node) {
- if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
+ if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
+ t->parms.i_key != key) ||
t->parms.iph.saddr != 0 ||
t->parms.iph.daddr != 0 ||
!(t->dev->flags & IFF_UP))
@@ -171,7 +169,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm_kern *parms)
{
unsigned int h;
__be32 remote;
@@ -182,7 +180,8 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
else
remote = 0;
- if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+ if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
+ test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
i_key = 0;
h = ip_tunnel_hash(i_key, remote);
@@ -206,17 +205,19 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
}
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
int type)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
__be32 key = parms->i_key;
- __be16 flags = parms->i_flags;
int link = parms->link;
struct ip_tunnel *t = NULL;
struct hlist_head *head = ip_bucket(itn, parms);
+ ip_tunnel_flags_copy(flags, parms->i_flags);
+
hlist_for_each_entry_rcu(t, head, hash_node) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
@@ -230,7 +231,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
static struct net_device *__ip_tunnel_create(struct net *net,
const struct rtnl_link_ops *ops,
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm_kern *parms)
{
int err;
struct ip_tunnel *tunnel;
@@ -293,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
- RT_TOS(iph->tos), dev_net(dev),
+ iph->tos & INET_DSCP_MASK, dev_net(dev),
tunnel->parms.link, tunnel->fwmark, 0, 0);
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -326,7 +327,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
static struct ip_tunnel *ip_tunnel_create(struct net *net,
struct ip_tunnel_net *itn,
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm_kern *parms)
{
struct ip_tunnel *nt;
struct net_device *dev;
@@ -386,15 +387,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
}
#endif
- if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
- ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
+ test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
DEV_STATS_INC(tunnel->dev, rx_crc_errors);
DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
- if (tunnel->parms.i_flags&TUNNEL_SEQ) {
- if (!(tpi->flags&TUNNEL_SEQ) ||
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
+ if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
(tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
DEV_STATS_INC(tunnel->dev, rx_errors);
@@ -543,7 +544,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rt6_info *rt6;
__be32 daddr;
- rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
+ rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
NULL;
daddr = md ? dst : tunnel->parms.iph.daddr;
@@ -609,9 +610,9 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
}
ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
- tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
- dev_net(dev), 0, skb->mark, skb_get_hash(skb),
- key->flow_flags);
+ tunnel_id_to_key32(key->tun_id),
+ tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark,
+ skb_get_hash(skb), key->flow_flags);
if (!tunnel_hlen)
tunnel_hlen = ip_encap_hlen(&tun_info->encap);
@@ -638,7 +639,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
- if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
df = htons(IP_DF);
if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
key->u.ipv4.dst, true)) {
@@ -772,7 +773,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
- tunnel->parms.o_key, RT_TOS(tos),
+ tunnel->parms.o_key, tos & INET_DSCP_MASK,
dev_net(dev), READ_ONCE(tunnel->parms.link),
tunnel->fwmark, skb_get_hash(skb), 0);
@@ -871,7 +872,7 @@ EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel *t,
struct net_device *dev,
- struct ip_tunnel_parm *p,
+ struct ip_tunnel_parm_kern *p,
bool set_mtu,
__u32 fwmark)
{
@@ -897,13 +898,14 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
t->fwmark = fwmark;
mtu = ip_tunnel_bind_dev(dev);
if (set_mtu)
- dev->mtu = mtu;
+ WRITE_ONCE(dev->mtu, mtu);
}
dst_cache_reset(&t->dst_cache);
netdev_state_change(dev);
}
-int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+ int cmd)
{
int err = 0;
struct ip_tunnel *t = netdev_priv(dev);
@@ -927,10 +929,10 @@ int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
goto done;
if (p->iph.ttl)
p->iph.frag_off |= htons(IP_DF);
- if (!(p->i_flags & VTI_ISVTI)) {
- if (!(p->i_flags & TUNNEL_KEY))
+ if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
+ if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
p->i_key = 0;
- if (!(p->o_flags & TUNNEL_KEY))
+ if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
p->o_key = 0;
}
@@ -1005,16 +1007,58 @@ done:
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
+bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
+ const void __user *data)
+{
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, data, sizeof(p)))
+ return false;
+
+ strscpy(kp->name, p.name);
+ kp->link = p.link;
+ ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
+ ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
+ kp->i_key = p.i_key;
+ kp->o_key = p.o_key;
+ memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);
+
+bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
+{
+ struct ip_tunnel_parm p;
+
+ if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(kp->o_flags))
+ return false;
+
+ memset(&p, 0, sizeof(p));
+
+ strscpy(p.name, kp->name);
+ p.link = kp->link;
+ p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
+ p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
+ p.i_key = kp->i_key;
+ p.o_key = kp->o_key;
+ memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
+
+ return !copy_to_user(data, &p, sizeof(p));
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);
+
int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
void __user *data, int cmd)
{
- struct ip_tunnel_parm p;
+ struct ip_tunnel_parm_kern p;
int err;
- if (copy_from_user(&p, data, sizeof(p)))
+ if (!ip_tunnel_parm_from_user(&p, data))
return -EFAULT;
err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
- if (!err && copy_to_user(data, &p, sizeof(p)))
+ if (!err && !ip_tunnel_parm_to_user(data, &p))
return -EFAULT;
return err;
}
@@ -1039,7 +1083,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
new_mtu = max_mtu;
}
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
@@ -1056,7 +1100,6 @@ static void ip_tunnel_dev_free(struct net_device *dev)
gro_cells_destroy(&tunnel->gro_cells);
dst_cache_destroy(&tunnel->dst_cache);
- free_percpu(dev->tstats);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
@@ -1077,7 +1120,7 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- return tunnel->net;
+ return READ_ONCE(tunnel->net);
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);
@@ -1093,7 +1136,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
- struct ip_tunnel_parm parms;
+ struct ip_tunnel_parm_kern parms;
unsigned int i;
itn->rtnl_link_ops = ops;
@@ -1119,7 +1162,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
* Allowing to move it to another netns is clearly unsafe.
*/
if (!IS_ERR(itn->fb_tunnel_dev)) {
- itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ itn->fb_tunnel_dev->netns_local = true;
itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
itn->type = itn->fb_tunnel_dev->type;
@@ -1171,7 +1214,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm *p, __u32 fwmark)
+ struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
struct ip_tunnel *nt;
struct net *net = dev_net(dev);
@@ -1225,7 +1268,7 @@ err_register_netdevice:
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm *p, __u32 fwmark)
+ struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
struct ip_tunnel *t;
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -1270,26 +1313,21 @@ int ip_tunnel_init(struct net_device *dev)
dev->needs_free_netdev = true;
dev->priv_destructor = ip_tunnel_dev_free;
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
- if (err) {
- free_percpu(dev->tstats);
+ if (err)
return err;
- }
err = gro_cells_init(&tunnel->gro_cells, dev);
if (err) {
dst_cache_destroy(&tunnel->dst_cache);
- free_percpu(dev->tstats);
return err;
}
tunnel->dev = dev;
tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
iph->version = 4;
iph->ihl = 5;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 80ccd6661aa3..a3676155be78 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -125,6 +125,7 @@ EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags)
{
+ IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
struct metadata_dst *res;
struct ip_tunnel_info *dst, *src;
@@ -144,10 +145,10 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
sizeof(struct in6_addr));
else
dst->key.u.ipv4.dst = src->key.u.ipv4.src;
- dst->key.tun_flags = src->key.tun_flags;
+ ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags);
dst->mode = src->mode | IP_TUNNEL_INFO_TX;
ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
- src->options_len, 0);
+ src->options_len, tun_flags);
return res;
}
@@ -497,7 +498,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr,
opt->opt_class = nla_get_be16(attr);
attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
opt->type = nla_get_u8(attr);
- info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+ __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
}
return sizeof(struct geneve_opt) + data_len;
@@ -525,7 +526,7 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
md->gbp = nla_get_u32(attr);
md->gbp &= VXLAN_GBP_MASK;
- info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+ __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
}
return sizeof(struct vxlan_metadata);
@@ -574,7 +575,7 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr,
set_hwid(&md->u.md2, nla_get_u8(attr));
}
- info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
}
return sizeof(struct erspan_metadata);
@@ -585,7 +586,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
{
int err, rem, opt_len, opts_len = 0;
struct nlattr *nla;
- __be16 type = 0;
+ u32 type = 0;
if (!attr)
return 0;
@@ -598,7 +599,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
switch (nla_type(nla)) {
case LWTUNNEL_IP_OPTS_GENEVE:
- if (type && type != TUNNEL_GENEVE_OPT)
+ if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
return -EINVAL;
opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
extack);
@@ -607,7 +608,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
opts_len += opt_len;
if (opts_len > IP_TUNNEL_OPTS_MAX)
return -EINVAL;
- type = TUNNEL_GENEVE_OPT;
+ type = IP_TUNNEL_GENEVE_OPT_BIT;
break;
case LWTUNNEL_IP_OPTS_VXLAN:
if (type)
@@ -617,7 +618,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
if (opt_len < 0)
return opt_len;
opts_len += opt_len;
- type = TUNNEL_VXLAN_OPT;
+ type = IP_TUNNEL_VXLAN_OPT_BIT;
break;
case LWTUNNEL_IP_OPTS_ERSPAN:
if (type)
@@ -627,7 +628,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
if (opt_len < 0)
return opt_len;
opts_len += opt_len;
- type = TUNNEL_ERSPAN_OPT;
+ type = IP_TUNNEL_ERSPAN_OPT_BIT;
break;
default:
return -EINVAL;
@@ -705,10 +706,16 @@ static int ip_tun_build_state(struct net *net, struct nlattr *attr,
if (tb[LWTUNNEL_IP_TOS])
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
- if (tb[LWTUNNEL_IP_FLAGS])
- tun_info->key.tun_flags |=
- (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) &
- ~TUNNEL_OPTIONS_PRESENT);
+ if (tb[LWTUNNEL_IP_FLAGS]) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+
+ ip_tunnel_flags_from_be16(flags,
+ nla_get_be16(tb[LWTUNNEL_IP_FLAGS]));
+ ip_tunnel_clear_options_present(flags);
+
+ ip_tunnel_flags_or(tun_info->key.tun_flags,
+ tun_info->key.tun_flags, flags);
+ }
tun_info->mode = IP_TUNNEL_INFO_TX;
tun_info->options_len = opt_len;
@@ -812,18 +819,18 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
struct nlattr *nest;
int err = 0;
- if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+ if (!ip_tunnel_is_options_present(tun_info->key.tun_flags))
return 0;
nest = nla_nest_start_noflag(skb, type);
if (!nest)
return -ENOMEM;
- if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT)
+ if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags))
err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
- else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT)
+ else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags))
err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
- else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)
+ else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
if (err) {
@@ -846,7 +853,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) ||
+ nla_put_be16(skb, LWTUNNEL_IP_FLAGS,
+ ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
return -ENOMEM;
@@ -857,11 +865,11 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
{
int opt_len;
- if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+ if (!ip_tunnel_is_options_present(info->key.tun_flags))
return 0;
opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */
- if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+ if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
struct geneve_opt *opt;
int offset = 0;
@@ -874,10 +882,10 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
/* OPT_GENEVE_DATA */
offset += sizeof(*opt) + opt->length * 4;
}
- } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+ } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */
+ nla_total_size(4); /* OPT_VXLAN_GBP */
- } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
struct erspan_metadata *md = ip_tunnel_info_opts(info);
opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */
@@ -984,10 +992,17 @@ static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
if (tb[LWTUNNEL_IP6_TC])
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
- if (tb[LWTUNNEL_IP6_FLAGS])
- tun_info->key.tun_flags |=
- (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) &
- ~TUNNEL_OPTIONS_PRESENT);
+ if (tb[LWTUNNEL_IP6_FLAGS]) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ __be16 data;
+
+ data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+ ip_tunnel_flags_from_be16(flags, data);
+ ip_tunnel_clear_options_present(flags);
+
+ ip_tunnel_flags_or(tun_info->key.tun_flags,
+ tun_info->key.tun_flags, flags);
+ }
tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
tun_info->options_len = opt_len;
@@ -1008,7 +1023,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) ||
+ nla_put_be16(skb, LWTUNNEL_IP6_FLAGS,
+ ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
return -ENOMEM;
@@ -1116,7 +1132,7 @@ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
void ip_tunnel_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm_kern *parms)
{
if (data[IFLA_IPTUN_LINK])
parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
@@ -1139,8 +1155,12 @@ void ip_tunnel_netlink_parms(struct nlattr *data[],
if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
parms->iph.frag_off = htons(IP_DF);
- if (data[IFLA_IPTUN_FLAGS])
- parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+ if (data[IFLA_IPTUN_FLAGS]) {
+ __be16 flags;
+
+ flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+ ip_tunnel_flags_from_be16(parms->i_flags, flags);
+ }
if (data[IFLA_IPTUN_PROTO])
parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index ee587adb169f..f0b4419cef34 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -51,8 +51,11 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
const struct iphdr *iph = ip_hdr(skb);
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
iph->saddr, iph->daddr, 0);
if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -167,7 +170,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
struct flowi *fl)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct ip_tunnel_parm *parms = &tunnel->parms;
+ struct ip_tunnel_parm_kern *parms = &tunnel->parms;
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev; /* Device to other host */
int pkt_len = skb->len;
@@ -322,8 +325,11 @@ static int vti4_err(struct sk_buff *skb, u32 info)
const struct iphdr *iph = (const struct iphdr *)skb->data;
int protocol = iph->protocol;
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
iph->daddr, iph->saddr, 0);
if (!tunnel)
return -1;
@@ -373,8 +379,9 @@ static int vti4_err(struct sk_buff *skb, u32 info)
}
static int
-vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
{
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
int err = 0;
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
@@ -383,20 +390,26 @@ vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
return -EINVAL;
}
- if (!(p->i_flags & GRE_KEY))
+ if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(p->o_flags))
+ return -EOVERFLOW;
+
+ if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY))
p->i_key = 0;
- if (!(p->o_flags & GRE_KEY))
+ if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY))
p->o_key = 0;
- p->i_flags = VTI_ISVTI;
+ __set_bit(IP_TUNNEL_VTI_BIT, flags);
+ ip_tunnel_flags_copy(p->i_flags, flags);
err = ip_tunnel_ctl(dev, p, cmd);
if (err)
return err;
if (cmd != SIOCDELTUNNEL) {
- p->i_flags |= GRE_KEY;
- p->o_flags |= GRE_KEY;
+ ip_tunnel_flags_from_be16(flags, GRE_KEY);
+ ip_tunnel_flags_or(p->i_flags, p->i_flags, flags);
+ ip_tunnel_flags_or(p->o_flags, p->o_flags, flags);
}
return 0;
}
@@ -430,7 +443,7 @@ static int vti_tunnel_init(struct net_device *dev)
dev->flags = IFF_NOARP;
dev->addr_len = 4;
- dev->features |= NETIF_F_LLTX;
+ dev->lltx = true;
netif_keep_dst(dev);
return ip_tunnel_init(dev);
@@ -531,7 +544,7 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
}
static void vti_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
__u32 *fwmark)
{
memset(parms, 0, sizeof(*parms));
@@ -541,7 +554,7 @@ static void vti_netlink_parms(struct nlattr *data[],
if (!data)
return;
- parms->i_flags = VTI_ISVTI;
+ __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags);
if (data[IFLA_VTI_LINK])
parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
@@ -566,7 +579,7 @@ static int vti_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_parm parms;
+ struct ip_tunnel_parm_kern parms;
__u32 fwmark = 0;
vti_netlink_parms(data, &parms, &fwmark);
@@ -578,8 +591,8 @@ static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = t->fwmark;
- struct ip_tunnel_parm p;
vti_netlink_parms(data, &p, &fwmark);
return ip_tunnel_changelink(dev, tb, &p, fwmark);
@@ -606,7 +619,7 @@ static size_t vti_get_size(const struct net_device *dev)
static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm *p = &t->parms;
+ struct ip_tunnel_parm_kern *p = &t->parms;
if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index f2696eaadbe6..dc0db5895e0e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -130,13 +130,16 @@ static int ipip_err(struct sk_buff *skb, u32 info)
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
const struct iphdr *iph = (const struct iphdr *)skb->data;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
int err = 0;
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->daddr, iph->saddr, 0);
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr,
+ iph->saddr, 0);
if (!t) {
err = -ENOENT;
goto out;
@@ -213,13 +216,16 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
struct metadata_dst *tun_dst = NULL;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
iph = ip_hdr(skb);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->saddr, iph->daddr, 0);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr,
+ iph->daddr, 0);
if (tunnel) {
const struct tnl_ptk_info *tpi;
@@ -238,7 +244,9 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
if (tunnel->collect_md) {
- tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+ ip_tunnel_flags_zero(flags);
+
+ tun_dst = ip_tun_rx_dst(skb, flags, 0, 0);
if (!tun_dst)
return 0;
ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
@@ -330,7 +338,7 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto)
}
static int
-ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
{
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
if (p->iph.version != 4 ||
@@ -340,7 +348,8 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
}
p->i_key = p->o_key = 0;
- p->i_flags = p->o_flags = 0;
+ ip_tunnel_flags_zero(p->i_flags);
+ ip_tunnel_flags_zero(p->o_flags);
return ip_tunnel_ctl(dev, p, cmd);
}
@@ -369,7 +378,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->type = ARPHRD_TUNNEL;
dev->flags = IFF_NOARP;
dev->addr_len = 4;
- dev->features |= NETIF_F_LLTX;
+ dev->lltx = true;
netif_keep_dst(dev);
dev->features |= IPIP_FEATURES;
@@ -405,8 +414,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
}
static void ipip_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms, bool *collect_md,
- __u32 *fwmark)
+ struct ip_tunnel_parm_kern *parms,
+ bool *collect_md, __u32 *fwmark)
{
memset(parms, 0, sizeof(*parms));
@@ -432,8 +441,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = 0;
if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
@@ -452,8 +461,8 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
bool collect_md;
__u32 fwmark = t->fwmark;
@@ -510,7 +519,7 @@ static size_t ipip_get_size(const struct net_device *dev)
static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct ip_tunnel_parm *parm = &tunnel->parms;
+ struct ip_tunnel_parm_kern *parm = &tunnel->parms;
if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index fd5c01c8489f..089864c6a35e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -62,6 +62,7 @@
#include <net/fib_rules.h>
#include <linux/netconf.h>
#include <net/rtnh.h>
+#include <net/inet_dscp.h>
#include <linux/nospec.h>
@@ -441,7 +442,7 @@ static bool ipmr_init_vif_indev(const struct net_device *dev)
static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
struct net_device *tunnel_dev, *new_dev;
- struct ip_tunnel_parm p = { };
+ struct ip_tunnel_parm_kern p = { };
int err;
tunnel_dev = __dev_get_by_name(net, "tunl0");
@@ -536,7 +537,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
dev->needs_free_netdev = true;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->netns_local = true;
}
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
@@ -1868,7 +1869,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
vif->remote, vif->local,
0, 0,
IPPROTO_IPIP,
- RT_TOS(iph->tos), vif->link);
+ iph->tos & INET_DSCP_MASK, vif->link);
if (IS_ERR(rt))
goto out_free;
encap = sizeof(struct iphdr);
@@ -1876,7 +1877,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
0, 0,
IPPROTO_IPIP,
- RT_TOS(iph->tos), vif->link);
+ iph->tos & INET_DSCP_MASK, vif->link);
if (IS_ERR(rt))
goto out_free;
}
@@ -2080,7 +2081,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_tos = iph->tos & INET_DSCP_MASK,
.flowi4_oif = (rt_is_output_route(rt) ?
skb->dev->ifindex : 0),
.flowi4_iif = (rt_is_output_route(rt) ?
@@ -2406,8 +2407,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
errout:
kfree_skb(skb);
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}
static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 0e3ee1532848..8ddac1f595ed 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -7,7 +7,7 @@
#include <net/net_namespace.h>
#include <net/tcp.h>
-static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
+static int ip_metrics_convert(struct nlattr *fc_mx,
int fc_mx_len, u32 *metrics,
struct netlink_ext_ack *extack)
{
@@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
char tmp[TCP_CA_NAME_MAX];
nla_strscpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC) {
NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
return -EINVAL;
@@ -63,7 +63,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
return 0;
}
-struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
+struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx,
int fc_mx_len,
struct netlink_ext_ack *extack)
{
@@ -77,7 +77,7 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
if (unlikely(!fib_metrics))
return ERR_PTR(-ENOMEM);
- err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics,
+ err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics,
extack);
if (!err) {
refcount_set(&fib_metrics->refcnt, 1);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 591a2737808e..e0aab66cd925 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -14,6 +14,7 @@
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/inet_dscp.h>
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
@@ -43,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
*/
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_tos = iph->tos & INET_DSCP_MASK;
fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 14365b20f1c5..1cdd9c28ab2d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -826,7 +826,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
sizeof(info.underflow));
info.num_entries = private->number;
info.size = private->size;
- strcpy(info.name, name);
+ strscpy(info.name, name);
if (copy_to_user(user, &info, *len) != 0)
ret = -EFAULT;
@@ -1547,7 +1547,7 @@ int arpt_register_table(struct net *net,
goto out_free;
}
- ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
if (!ops) {
ret = -ENOMEM;
goto out_free;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index fe89a056eb06..3d101613f27f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -981,7 +981,7 @@ static int get_info(struct net *net, void __user *user, const int *len)
sizeof(info.underflow));
info.num_entries = private->number;
info.size = private->size;
- strcpy(info.name, name);
+ strscpy(info.name, name);
if (copy_to_user(user, &info, *len) != 0)
ret = -EFAULT;
@@ -1767,7 +1767,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
goto out_free;
}
- ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
if (!ops) {
ret = -ENOMEM;
goto out_free;
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index ded5bef02f77..1ce7a1655b97 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include <net/inet_dscp.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/ip_fib.h>
@@ -75,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- flow.flowi4_tos = iph->tos & IPTOS_RT_MASK;
+ flow.flowi4_tos = iph->tos & INET_DSCP_MASK;
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index b9062f4552ac..3ab908b74795 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -44,7 +44,7 @@ static int iptable_filter_table_init(struct net *net)
return -ENOMEM;
/* Entry 1 is the FORWARD hook */
((struct ipt_standard *)repl->entries)[1].target.verdict =
- forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+ forward ? -NF_ACCEPT - 1 : NF_DROP - 1;
err = ipt_register_table(net, &packet_filter, repl, filter_ops);
kfree(repl);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 4d42d0756fd7..a5db7c67d61b 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -145,25 +145,27 @@ static struct pernet_operations iptable_nat_net_ops = {
static int __init iptable_nat_init(void)
{
- int ret = xt_register_template(&nf_nat_ipv4_table,
- iptable_nat_table_init);
+ int ret;
+ /* net->gen->ptr[iptable_nat_net_id] must be allocated
+ * before calling iptable_nat_table_init().
+ */
+ ret = register_pernet_subsys(&iptable_nat_net_ops);
if (ret < 0)
return ret;
- ret = register_pernet_subsys(&iptable_nat_net_ops);
- if (ret < 0) {
- xt_unregister_template(&nf_nat_ipv4_table);
- return ret;
- }
+ ret = xt_register_template(&nf_nat_ipv4_table,
+ iptable_nat_table_init);
+ if (ret < 0)
+ unregister_pernet_subsys(&iptable_nat_net_ops);
return ret;
}
static void __exit iptable_nat_exit(void)
{
- unregister_pernet_subsys(&iptable_nat_net_ops);
xt_unregister_template(&nf_nat_ipv4_table);
+ unregister_pernet_subsys(&iptable_nat_net_ops);
}
module_init(iptable_nat_init);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 6cc5743c553a..f4aed0789d69 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -15,6 +15,7 @@
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
+#include <net/inet_dscp.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
@@ -32,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
fl4.flowi4_oif = oif;
fl4.daddr = gw->s_addr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_tos = iph->tos & INET_DSCP_MASK;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 69e331799604..73e66a088e25 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -58,6 +58,8 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
+ if (!indev)
+ return daddr;
in_dev_for_each_ifa_rcu(ifa, indev) {
if (ifa->ifa_flags & IFA_F_SECONDARY)
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index a522c3a3be52..ef5dd88107dd 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -40,13 +40,13 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;
- err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
sizeof(struct in_addr));
if (err < 0)
return err;
if (tb[NFTA_DUP_SREG_DEV])
- err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV],
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV],
&priv->sreg_dev, sizeof(int));
return err;
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 9eee535c64dd..00da1332bbf1 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -10,6 +10,7 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
+#include <net/inet_dscp.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -22,8 +23,6 @@ static __be32 get_saddr(__be32 addr)
return addr;
}
-#define DSCP_BITS 0xfc
-
void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
@@ -110,7 +109,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (priv->flags & NFTA_FIB_F_MARK)
fl4.flowi4_mark = pkt->skb->mark;
- fl4.flowi4_tos = iph->tos & DSCP_BITS;
+ fl4.flowi4_tos = iph->tos & INET_DSCP_MASK;
if (priv->flags & NFTA_FIB_F_DADDR) {
fl4.daddr = iph->daddr;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 535856b0f0ed..93aaea0006ba 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -865,15 +865,18 @@ out:
}
static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
- u32 op_flags)
+ u32 op_flags, u32 *resp_op_flags)
{
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
struct nexthop_grp *p;
size_t len = nhg->num_nh * sizeof(*p);
struct nlattr *nla;
u16 group_type = 0;
+ u16 weight;
int i;
+ *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;
+
if (nhg->hash_threshold)
group_type = NEXTHOP_GRP_TYPE_MPATH;
else if (nhg->resilient)
@@ -888,9 +891,13 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
p = nla_data(nla);
for (i = 0; i < nhg->num_nh; ++i) {
- p->id = nhg->nh_entries[i].nh->id;
- p->weight = nhg->nh_entries[i].weight - 1;
- p += 1;
+ weight = nhg->nh_entries[i].weight - 1;
+
+ *p++ = (struct nexthop_grp) {
+ .id = nhg->nh_entries[i].nh->id,
+ .weight = weight,
+ .weight_high = weight >> 8,
+ };
}
if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
@@ -933,10 +940,12 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ u32 resp_op_flags = 0;
if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
goto nla_put_failure;
- if (nla_put_nh_group(skb, nh, op_flags))
+ if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
+ nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
goto nla_put_failure;
goto out;
}
@@ -1049,7 +1058,9 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
sz += nla_total_size(4); /* NHA_ID */
if (nh->is_group)
- sz += nh_nlmsg_size_grp(nh);
+ sz += nh_nlmsg_size_grp(nh) +
+ nla_total_size(4) + /* NHA_OP_FLAGS */
+ 0;
else
sz += nh_nlmsg_size_single(nh);
@@ -1079,8 +1090,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
info->nlh, gfp_any());
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
}
static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
@@ -1200,8 +1210,7 @@ static void nexthop_bucket_notify(struct nh_res_table *res_table,
rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
+ rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
}
static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
@@ -1279,11 +1288,14 @@ static int nh_check_attr_group(struct net *net,
nhg = nla_data(tb[NHA_GROUP]);
for (i = 0; i < len; ++i) {
- if (nhg[i].resvd1 || nhg[i].resvd2) {
- NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
+ if (nhg[i].resvd2) {
+ NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
return -EINVAL;
}
- if (nhg[i].weight > 254) {
+ if (nexthop_grp_weight(&nhg[i]) == 0) {
+ /* 0xffff got passed in, representing weight of 0x10000,
+ * which is too heavy.
+ */
NL_SET_ERR_MSG(extack, "Invalid value for weight");
return -EINVAL;
}
@@ -1879,9 +1891,9 @@ static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
static void nh_res_group_rebalance(struct nh_group *nhg,
struct nh_res_table *res_table)
{
- int prev_upper_bound = 0;
- int total = 0;
- int w = 0;
+ u16 prev_upper_bound = 0;
+ u32 total = 0;
+ u32 w = 0;
int i;
INIT_LIST_HEAD(&res_table->uw_nh_entries);
@@ -1891,11 +1903,12 @@ static void nh_res_group_rebalance(struct nh_group *nhg,
for (i = 0; i < nhg->num_nh; ++i) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
- int upper_bound;
+ u16 upper_bound;
+ u64 btw;
w += nhge->weight;
- upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
- total);
+ btw = ((u64)res_table->num_nh_buckets) * w;
+ upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total);
nhge->res.wants_buckets = upper_bound - prev_upper_bound;
prev_upper_bound = upper_bound;
@@ -1961,8 +1974,8 @@ static void replace_nexthop_grp_res(struct nh_group *oldg,
static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
- int total = 0;
- int w = 0;
+ u32 total = 0;
+ u32 w = 0;
int i;
for (i = 0; i < nhg->num_nh; ++i)
@@ -1970,7 +1983,7 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg)
for (i = 0; i < nhg->num_nh; ++i) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
- int upper_bound;
+ u32 upper_bound;
w += nhge->weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
@@ -2712,7 +2725,8 @@ static struct nexthop *nexthop_create_group(struct net *net,
goto out_no_nh;
}
nhg->nh_entries[i].nh = nhe;
- nhg->nh_entries[i].weight = entry[i].weight + 1;
+ nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);
+
list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
nhg->nh_entries[i].nh_parent = nh;
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 823306487a82..619ddc087957 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -946,7 +946,7 @@ static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
inet_sk(sk), inet_sk(sk)->inet_num, skb);
if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
- kfree_skb_reason(skb, reason);
+ sk_skb_reason_drop(sk, skb, reason);
pr_debug("ping_queue_rcv_skb -> failed\n");
return reason;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 914bc9c35cc7..40053a02bae1 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -33,6 +33,7 @@
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <linux/bottom_half.h>
@@ -43,7 +44,7 @@
#include <net/sock.h>
#include <net/raw.h>
-#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)
+#define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX)
/*
* Report socket allocation statistics [mea@utu.fi]
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index dcb11f22cbf2..474dfd263c8b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -301,7 +301,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
ipv4_pktinfo_prepare(sk, skb, true);
if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
- kfree_skb_reason(skb, reason);
+ sk_skb_reason_drop(sk, skb, reason);
return NET_RX_DROP;
}
@@ -312,7 +312,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
atomic_inc(&sk->sk_drops);
- kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
nf_reset_ct(skb);
@@ -360,7 +360,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->protocol = htons(ETH_P_IP);
skb->priority = READ_ONCE(sk->sk_priority);
skb->mark = sockc->mark;
- skb->tstamp = sockc->transmit_time;
+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
skb_dst_set(skb, &rt->dst);
*rtp = NULL;
@@ -612,6 +612,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0, sk->sk_uid);
+ fl4.fl4_icmp_type = 0;
+ fl4.fl4_icmp_code = 0;
+
if (!hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d36ace160d42..723ac9181558 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -106,9 +106,6 @@
#include "fib_lookup.h"
-#define RT_FL_TOS(oldflp4) \
- ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
-
#define RT_GC_TIMEOUT (300*HZ)
#define DEFAULT_MIN_PMTU (512 + 20 + 20)
@@ -132,7 +129,8 @@ struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int ipv4_mtu(const struct dst_entry *dst);
-static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+static void ipv4_negative_advice(struct sock *sk,
+ struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu,
@@ -498,15 +496,6 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
}
EXPORT_SYMBOL(__ip_select_ident);
-static void ip_rt_fix_tos(struct flowi4 *fl4)
-{
- __u8 tos = RT_FL_TOS(fl4);
-
- fl4->flowi4_tos = tos & IPTOS_RT_MASK;
- if (tos & RTO_ONLINK)
- fl4->flowi4_scope = RT_SCOPE_LINK;
-}
-
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
const struct sock *sk, const struct iphdr *iph,
int oif, __u8 tos, u8 prot, u32 mark,
@@ -523,7 +512,7 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
sk->sk_protocol;
}
- flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
+ flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope,
prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
sock_net_uid(net, sk));
}
@@ -552,7 +541,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
- ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
+ ip_sock_rt_tos(sk),
ip_sock_rt_scope(sk),
inet_test_bit(HDRINCL, sk) ?
IPPROTO_RAW : sk->sk_protocol,
@@ -831,28 +820,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
u32 mark = skb->mark;
__u8 tos = iph->tos;
- rt = (struct rtable *) dst;
+ rt = dst_rtable(dst);
__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
-static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+static void ipv4_negative_advice(struct sock *sk,
+ struct dst_entry *dst)
{
- struct rtable *rt = (struct rtable *)dst;
- struct dst_entry *ret = dst;
+ struct rtable *rt = dst_rtable(dst);
- if (rt) {
- if (dst->obsolete > 0) {
- ip_rt_put(rt);
- ret = NULL;
- } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- rt->dst.expires) {
- ip_rt_put(rt);
- ret = NULL;
- }
- }
- return ret;
+ if ((dst->obsolete > 0) ||
+ (rt->rt_flags & RTCF_REDIRECTED) ||
+ rt->dst.expires)
+ sk_dst_reset(sk);
}
/*
@@ -1056,7 +1038,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu,
bool confirm_neigh)
{
- struct rtable *rt = (struct rtable *) dst;
+ struct rtable *rt = dst_rtable(dst);
struct flowi4 fl4;
ip_rt_build_flow_key(&fl4, sk, skb);
@@ -1127,7 +1109,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = (struct rtable *)odst;
+ rt = dst_rtable(odst);
if (odst->obsolete && !odst->ops->check(odst, 0)) {
rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
if (IS_ERR(rt))
@@ -1136,7 +1118,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
new = true;
}
- __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
+ __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);
if (!dst_check(&rt->dst, 0)) {
if (new)
@@ -1193,7 +1175,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
u32 cookie)
{
- struct rtable *rt = (struct rtable *) dst;
+ struct rtable *rt = dst_rtable(dst);
/* All IPV4 dsts are created with ->obsolete set to the value
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
@@ -1281,7 +1263,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_tos = iph->tos & INET_DSCP_MASK,
.flowi4_oif = rt->dst.dev->ifindex,
.flowi4_iif = skb->dev->ifindex,
.flowi4_mark = skb->mark,
@@ -1499,7 +1481,6 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
struct uncached_list {
spinlock_t lock;
struct list_head head;
- struct list_head quarantine;
};
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
@@ -1528,10 +1509,8 @@ void rt_del_uncached_list(struct rtable *rt)
static void ipv4_dst_destroy(struct dst_entry *dst)
{
- struct rtable *rt = (struct rtable *)dst;
-
ip_dst_metrics_put(dst);
- rt_del_uncached_list(rt);
+ rt_del_uncached_list(dst_rtable(dst));
}
void rt_flush_dev(struct net_device *dev)
@@ -1552,7 +1531,7 @@ void rt_flush_dev(struct net_device *dev)
rt->dst.dev = blackhole_netdev;
netdev_ref_replace(dev, blackhole_netdev,
&rt->dst.dev_tracker, GFP_ATOMIC);
- list_move(&rt->dst.rt_uncached, &ul->quarantine);
+ list_del_init(&rt->dst.rt_uncached);
}
spin_unlock_bh(&ul->lock);
}
@@ -1944,7 +1923,7 @@ static u32 fib_multipath_custom_hash_outer(const struct net *net,
hash_keys.ports.dst = keys.ports.dst;
*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
- return flow_hash_from_keys(&hash_keys);
+ return fib_multipath_hash_from_keys(net, &hash_keys);
}
static u32 fib_multipath_custom_hash_inner(const struct net *net,
@@ -1993,7 +1972,7 @@ static u32 fib_multipath_custom_hash_inner(const struct net *net,
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
hash_keys.ports.dst = keys.ports.dst;
- return flow_hash_from_keys(&hash_keys);
+ return fib_multipath_hash_from_keys(net, &hash_keys);
}
static u32 fib_multipath_custom_hash_skb(const struct net *net,
@@ -2030,7 +2009,7 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net,
if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
hash_keys.ports.dst = fl4->fl4_dport;
- return flow_hash_from_keys(&hash_keys);
+ return fib_multipath_hash_from_keys(net, &hash_keys);
}
/* if skb is set it will be used and fl4 can be NULL */
@@ -2051,7 +2030,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
}
- mhash = flow_hash_from_keys(&hash_keys);
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
break;
case 1:
/* skb is currently provided only when forwarding */
@@ -2085,7 +2064,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.ports.dst = fl4->fl4_dport;
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
- mhash = flow_hash_from_keys(&hash_keys);
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
break;
case 2:
memset(&hash_keys, 0, sizeof(hash_keys));
@@ -2116,7 +2095,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
}
- mhash = flow_hash_from_keys(&hash_keys);
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
break;
case 3:
if (skb)
@@ -2166,6 +2145,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
int err = -EINVAL;
u32 tag = 0;
+ if (!in_dev)
+ return -EINVAL;
+
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
@@ -2178,7 +2160,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (rt->rt_type != RTN_LOCAL)
goto skip_validate_source;
- tos &= IPTOS_RT_MASK;
+ tos &= INET_DSCP_MASK;
err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
if (err < 0)
goto martian_source;
@@ -2488,7 +2470,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
struct fib_result res;
int err;
- tos &= IPTOS_RT_MASK;
+ tos &= INET_DSCP_MASK;
rcu_read_lock();
err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
rcu_read_unlock();
@@ -2636,7 +2618,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
struct rtable *rth;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
- ip_rt_fix_tos(fl4);
+ fl4->flowi4_tos &= INET_DSCP_MASK;
rcu_read_lock();
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
@@ -2829,7 +2811,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
- struct rtable *ort = (struct rtable *) dst_orig;
+ struct rtable *ort = dst_rtable(dst_orig);
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
@@ -2874,9 +2856,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
if (flp4->flowi4_proto) {
flp4->flowi4_oif = rt->dst.dev->ifindex;
- rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
- flowi4_to_flowi(flp4),
- sk, 0);
+ rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0));
}
return rt;
@@ -2885,9 +2867,9 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
- struct rtable *rt, u32 table_id, struct flowi4 *fl4,
- struct sk_buff *skb, u32 portid, u32 seq,
- unsigned int flags)
+ struct rtable *rt, u32 table_id, dscp_t dscp,
+ struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
+ u32 seq, unsigned int flags)
{
struct rtmsg *r;
struct nlmsghdr *nlh;
@@ -2903,7 +2885,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
+ r->rtm_tos = inet_dscp_to_dsfield(dscp);
r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
@@ -3053,7 +3035,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
goto next;
err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
- table_id, NULL, skb,
+ table_id, 0, NULL, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, flags);
if (err)
@@ -3279,7 +3261,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
+ fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK;
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
@@ -3304,7 +3286,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
skb->dev = dev;
skb->mark = mark;
err = ip_route_input_rcu(skb, dst, src,
- rtm->rtm_tos & IPTOS_RT_MASK, dev,
+ rtm->rtm_tos & INET_DSCP_MASK, dev,
&res);
rt = skb_rtable(skb);
@@ -3349,7 +3331,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fri.tb_id = table_id;
fri.dst = res.prefix;
fri.dst_len = res.prefixlen;
- fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
+ fri.dscp = res.dscp;
fri.type = rt->rt_type;
fri.offload = 0;
fri.trap = 0;
@@ -3376,8 +3358,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
} else {
- err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid,
+ err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
+ skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, 0);
}
if (err < 0)
@@ -3406,7 +3388,7 @@ static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
-static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
+static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = (struct net *)__ctl->extra1;
@@ -3507,7 +3489,6 @@ static struct ctl_table ipv4_route_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
static const char ipv4_route_flush_procname[] = "flush";
@@ -3541,7 +3522,6 @@ static struct ctl_table ipv4_route_netns_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { },
};
static __net_init int sysctl_route_net_init(struct net *net)
@@ -3559,16 +3539,14 @@ static __net_init int sysctl_route_net_init(struct net *net)
/* Don't export non-whitelisted sysctls to unprivileged users */
if (net->user_ns != &init_user_ns) {
- if (tbl[0].procname != ipv4_route_flush_procname) {
- tbl[0].procname = NULL;
+ if (tbl[0].procname != ipv4_route_flush_procname)
table_size = 0;
- }
}
/* Update the variables to point into the current struct net
* except for the first element flush
*/
- for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
+ for (i = 1; i < table_size; i++)
tbl[i].data += (void *)net - (void *)&init_net;
}
tbl[0].extra1 = net;
@@ -3588,7 +3566,7 @@ err_dup:
static __net_exit void sysctl_route_net_exit(struct net *net)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = net->ipv4.route_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.route_hdr);
@@ -3682,7 +3660,6 @@ int __init ip_rt_init(void)
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
INIT_LIST_HEAD(&ul->head);
- INIT_LIST_HEAD(&ul->quarantine);
spin_lock_init(&ul->lock);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 500f665f98cb..1948d15f1f28 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -462,7 +462,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
/* Try to redo what tcp_v4_send_synack did. */
- req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :
+ dst_metric(&rt->dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
full_space = tcp_full_space(sk);
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
@@ -495,6 +496,6 @@ out:
out_free:
reqsk_free(req);
out_drop:
- kfree_skb_reason(skb, reason);
+ sk_skb_reason_drop(sk, skb, reason);
return NULL;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e4f16a7dcc1..a79b2a52ce01 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -62,7 +62,7 @@ static void set_local_port_range(struct net *net, unsigned int low, unsigned int
}
/* Validate changes from /proc interface. */
-static int ipv4_local_port_range(struct ctl_table *table, int write,
+static int ipv4_local_port_range(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = table->data;
@@ -96,7 +96,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
}
/* Validate changes from /proc interface. */
-static int ipv4_privileged_ports(struct ctl_table *table, int write,
+static int ipv4_privileged_ports(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
@@ -130,7 +130,8 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write,
return ret;
}
-static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
+static void inet_get_ping_group_range_table(const struct ctl_table *table,
+ kgid_t *low, kgid_t *high)
{
kgid_t *data = table->data;
struct net *net =
@@ -145,7 +146,8 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low
}
/* Update system visible IP port range */
-static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
+static void set_ping_group_range(const struct ctl_table *table,
+ kgid_t low, kgid_t high)
{
kgid_t *data = table->data;
struct net *net =
@@ -157,7 +159,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig
}
/* Validate changes from /proc interface. */
-static int ipv4_ping_group_range(struct ctl_table *table, int write,
+static int ipv4_ping_group_range(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct user_namespace *user_ns = current_user_ns();
@@ -192,7 +194,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
-static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
+static int ipv4_fwd_update_priority(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
@@ -208,7 +210,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
return ret;
}
-static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
+static int proc_tcp_congestion_control(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(ctl->data, struct net,
@@ -228,7 +230,7 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
return ret;
}
-static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
+static int proc_tcp_available_congestion_control(const struct ctl_table *ctl,
int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -244,7 +246,7 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
return ret;
}
-static int proc_allowed_congestion_control(struct ctl_table *ctl,
+static int proc_allowed_congestion_control(const struct ctl_table *ctl,
int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -281,7 +283,7 @@ static int sscanf_key(char *buf, __le32 *key)
return ret;
}
-static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
+static int proc_tcp_fastopen_key(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
@@ -352,7 +354,7 @@ bad_key:
return ret;
}
-static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
+static int proc_tfo_blackhole_detect_timeout(const struct ctl_table *table,
int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -367,7 +369,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
return ret;
}
-static int proc_tcp_available_ulp(struct ctl_table *ctl,
+static int proc_tcp_available_ulp(const struct ctl_table *ctl,
int write, void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -384,7 +386,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl,
return ret;
}
-static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
+static int proc_tcp_ehash_entries(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
@@ -408,7 +410,7 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}
-static int proc_udp_hash_entries(struct ctl_table *table, int write,
+static int proc_udp_hash_entries(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
@@ -432,7 +434,7 @@ static int proc_udp_hash_entries(struct ctl_table *table, int write,
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
+static int proc_fib_multipath_hash_policy(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -447,7 +449,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
return ret;
}
-static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write,
+static int proc_fib_multipath_hash_fields(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
@@ -462,6 +464,61 @@ static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write,
return ret;
}
+
+static u32 proc_fib_multipath_hash_rand_seed __ro_after_init;
+
+static void proc_fib_multipath_hash_init_rand_seed(void)
+{
+ get_random_bytes(&proc_fib_multipath_hash_rand_seed,
+ sizeof(proc_fib_multipath_hash_rand_seed));
+}
+
+static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
+{
+ struct sysctl_fib_multipath_hash_seed new = {
+ .user_seed = user_seed,
+ .mp_seed = (user_seed ? user_seed :
+ proc_fib_multipath_hash_rand_seed),
+ };
+
+ WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new);
+}
+
+static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct sysctl_fib_multipath_hash_seed *mphs;
+ struct net *net = table->data;
+ struct ctl_table tmp;
+ u32 user_seed;
+ int ret;
+
+ mphs = &net->ipv4.sysctl_fib_multipath_hash_seed;
+ user_seed = mphs->user_seed;
+
+ tmp = *table;
+ tmp.data = &user_seed;
+
+ ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ proc_fib_multipath_hash_set_seed(net, user_seed);
+ call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
+ }
+
+ return ret;
+}
+#else
+
+static void proc_fib_multipath_hash_init_rand_seed(void)
+{
+}
+
+static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
+{
+}
+
#endif
static struct ctl_table ipv4_table[] = {
@@ -544,22 +601,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_available_ulp,
},
{
- .procname = "icmp_msgs_per_sec",
- .data = &sysctl_icmp_msgs_per_sec,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "icmp_msgs_burst",
- .data = &sysctl_icmp_msgs_burst,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
- {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
@@ -575,7 +616,6 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &sysctl_fib_sync_mem_min,
.extra2 = &sysctl_fib_sync_mem_max,
},
- { }
};
static struct ctl_table ipv4_net_table[] = {
@@ -646,6 +686,22 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "icmp_msgs_per_sec",
+ .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "icmp_msgs_burst",
+ .data = &init_net.ipv4.sysctl_icmp_msgs_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
.procname = "ping_group_range",
.data = &init_net.ipv4.ping_group_range.range,
.maxlen = sizeof(gid_t)*2,
@@ -1071,6 +1127,13 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ONE,
.extra2 = &fib_multipath_hash_fields_all_mask,
},
+ {
+ .procname = "fib_multipath_hash_seed",
+ .data = &init_net,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_fib_multipath_hash_seed,
+ },
#endif
{
.procname = "ip_unprivileged_port_start",
@@ -1502,11 +1565,19 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ONE,
},
- { }
+ {
+ .procname = "tcp_rto_min_us",
+ .data = &init_net.ipv4.sysctl_tcp_rto_min_us,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
};
static __net_init int ipv4_sysctl_init_net(struct net *net)
{
+ size_t table_size = ARRAY_SIZE(ipv4_net_table);
struct ctl_table *table;
table = ipv4_net_table;
@@ -1517,7 +1588,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!table)
goto err_alloc;
- for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) {
+ for (i = 0; i < table_size; i++) {
if (table[i].data) {
/* Update the variables to point into
* the current struct net
@@ -1533,7 +1604,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
}
net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table,
- ARRAY_SIZE(ipv4_net_table));
+ table_size);
if (!net->ipv4.ipv4_hdr)
goto err_reg;
@@ -1541,6 +1612,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!net->ipv4.sysctl_local_reserved_ports)
goto err_ports;
+ proc_fib_multipath_hash_set_seed(net, 0);
+
return 0;
err_ports:
@@ -1554,7 +1627,7 @@ err_alloc:
static __net_exit void ipv4_sysctl_exit_net(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
kfree(net->ipv4.sysctl_local_reserved_ports);
table = net->ipv4.ipv4_hdr->ctl_table_arg;
@@ -1575,6 +1648,8 @@ static __init int sysctl_ipv4_init(void)
if (!hdr)
return -ENOMEM;
+ proc_fib_multipath_hash_init_rand_seed();
+
if (register_pernet_subsys(&ipv4_sysctl_ops)) {
unregister_net_sysctl_table(hdr);
return -ENOMEM;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e767721b3a58..4f77bd862e95 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -272,15 +272,21 @@
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>
+#include <net/rstreason.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
+#include <net/hotdata.h>
+#include <trace/events/tcp.h>
#include <net/rps.h>
+#include "../core/devmem.h"
+
/* Track pending CMSGs. */
enum {
TCP_CMSG_INQ = 1,
@@ -290,6 +296,9 @@ enum {
DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
+DEFINE_PER_CPU(u32, tcp_tw_isn);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
+
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
@@ -414,6 +423,7 @@ void tcp_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ int rto_min_us;
tp->out_of_order_queue = RB_ROOT;
sk->tcp_rtx_queue = RB_ROOT;
@@ -422,7 +432,8 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
- icsk->icsk_rto_min = TCP_RTO_MIN;
+ rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
+ icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
@@ -462,6 +473,7 @@ void tcp_init_sock(struct sock *sk)
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
sk_sockets_allocated_inc(sk);
+ xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
}
EXPORT_SYMBOL(tcp_init_sock);
@@ -592,7 +604,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
*/
mask |= EPOLLOUT | EPOLLWRNORM;
}
- /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
smp_rmb();
if (READ_ONCE(sk->sk_err) ||
!skb_queue_empty_lockless(&sk->sk_error_queue))
@@ -1159,6 +1171,9 @@ new_segment:
process_backlog++;
+#ifdef CONFIG_SKB_DECRYPTED
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
tcp_skb_entail(sk, skb);
copy = size_goal;
@@ -1184,7 +1199,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i >= READ_ONCE(sysctl_max_skb_frags)) {
+ if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1416,8 +1431,6 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
struct sk_buff *skb;
int copied = 0, err = 0;
- /* XXX -- need to support SO_PEEK_OFF */
-
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
@@ -1721,7 +1734,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
space = tcp_space_from_win(sk, val);
if (space > sk->sk_rcvbuf) {
WRITE_ONCE(sk->sk_rcvbuf, space);
- tcp_sk(sk)->window_clamp = val;
+ WRITE_ONCE(tcp_sk(sk)->window_clamp, val);
}
return 0;
}
@@ -2150,6 +2163,9 @@ static int tcp_zerocopy_receive(struct sock *sk,
skb = tcp_recv_skb(sk, seq, &offset);
}
+ if (!skb_frags_readable(skb))
+ break;
+
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, tss);
zc->msg_flags |= TCP_CMSG_TS;
@@ -2167,6 +2183,9 @@ static int tcp_zerocopy_receive(struct sock *sk,
break;
}
page = skb_frag_page(frags);
+ if (WARN_ON_ONCE(!page))
+ break;
+
prefetchw(page);
pages[pages_to_map++] = page;
length += PAGE_SIZE;
@@ -2225,6 +2244,7 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
struct scm_timestamping_internal *tss)
{
int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
+ u32 tsflags = READ_ONCE(sk->sk_tsflags);
bool has_timestamping = false;
if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
@@ -2264,14 +2284,18 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
}
}
- if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE)
+ if (tsflags & SOF_TIMESTAMPING_SOFTWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
has_timestamping = true;
else
tss->ts[0] = (struct timespec64) {0};
}
if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
- if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE)
+ if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
has_timestamping = true;
else
tss->ts[2] = (struct timespec64) {0};
@@ -2307,6 +2331,220 @@ static int tcp_inq_hint(struct sock *sk)
return inq;
}
+/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */
+struct tcp_xa_pool {
+ u8 max; /* max <= MAX_SKB_FRAGS */
+ u8 idx; /* idx <= max */
+ __u32 tokens[MAX_SKB_FRAGS];
+ netmem_ref netmems[MAX_SKB_FRAGS];
+};
+
+static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
+{
+ int i;
+
+ /* Commit part that has been copied to user space. */
+ for (i = 0; i < p->idx; i++)
+ __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
+ (__force void *)p->netmems[i], GFP_KERNEL);
+ /* Rollback what has been pre-allocated and is no longer needed. */
+ for (; i < p->max; i++)
+ __xa_erase(&sk->sk_user_frags, p->tokens[i]);
+
+ p->max = 0;
+ p->idx = 0;
+}
+
+static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
+{
+ if (!p->max)
+ return;
+
+ xa_lock_bh(&sk->sk_user_frags);
+
+ tcp_xa_pool_commit_locked(sk, p);
+
+ xa_unlock_bh(&sk->sk_user_frags);
+}
+
+static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
+ unsigned int max_frags)
+{
+ int err, k;
+
+ if (p->idx < p->max)
+ return 0;
+
+ xa_lock_bh(&sk->sk_user_frags);
+
+ tcp_xa_pool_commit_locked(sk, p);
+
+ for (k = 0; k < max_frags; k++) {
+ err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
+ XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
+ if (err)
+ break;
+ }
+
+ xa_unlock_bh(&sk->sk_user_frags);
+
+ p->max = k;
+ p->idx = 0;
+ return k ? 0 : err;
+}
+
+/* On error, returns the -errno. On success, returns number of bytes sent to the
+ * user. May not consume all of @remaining_len.
+ */
+static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
+ unsigned int offset, struct msghdr *msg,
+ int remaining_len)
+{
+ struct dmabuf_cmsg dmabuf_cmsg = { 0 };
+ struct tcp_xa_pool tcp_xa_pool;
+ unsigned int start;
+ int i, copy, n;
+ int sent = 0;
+ int err = 0;
+
+ tcp_xa_pool.max = 0;
+ tcp_xa_pool.idx = 0;
+ do {
+ start = skb_headlen(skb);
+
+ if (skb_frags_readable(skb)) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ /* Copy header. */
+ copy = start - offset;
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ n = copy_to_iter(skb->data + offset, copy,
+ &msg->msg_iter);
+ if (n != copy) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ offset += copy;
+ remaining_len -= copy;
+
+ /* First a dmabuf_cmsg for # bytes copied to user
+ * buffer.
+ */
+ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
+ dmabuf_cmsg.frag_size = copy;
+ err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
+ sizeof(dmabuf_cmsg), &dmabuf_cmsg);
+ if (err || msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~MSG_CTRUNC;
+ if (!err)
+ err = -ETOOSMALL;
+ goto out;
+ }
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+
+ /* after that, send information of dmabuf pages through a
+ * sequence of cmsg
+ */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct net_iov *niov;
+ u64 frag_offset;
+ int end;
+
+ /* !skb_frags_readable() should indicate that ALL the
+ * frags in this skb are dmabuf net_iovs. We're checking
+ * for that flag above, but also check individual frags
+ * here. If the tcp stack is not setting
+ * skb_frags_readable() correctly, we still don't want
+ * to crash here.
+ */
+ if (!skb_frag_net_iov(frag)) {
+ net_err_ratelimited("Found non-dmabuf skb with net_iov");
+ err = -ENODEV;
+ goto out;
+ }
+
+ niov = skb_frag_net_iov(frag);
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ frag_offset = net_iov_virtual_addr(niov) +
+ skb_frag_off(frag) + offset -
+ start;
+ dmabuf_cmsg.frag_offset = frag_offset;
+ dmabuf_cmsg.frag_size = copy;
+ err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
+ skb_shinfo(skb)->nr_frags - i);
+ if (err)
+ goto out;
+
+ /* Will perform the exchange later */
+ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
+ dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov);
+
+ offset += copy;
+ remaining_len -= copy;
+
+ err = put_cmsg(msg, SOL_SOCKET,
+ SO_DEVMEM_DMABUF,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err || msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~MSG_CTRUNC;
+ if (!err)
+ err = -ETOOSMALL;
+ goto out;
+ }
+
+ atomic_long_inc(&niov->pp_ref_count);
+ tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+ start = end;
+ }
+
+ tcp_xa_pool_commit(sk, &tcp_xa_pool);
+ if (!remaining_len)
+ goto out;
+
+ /* if remaining_len is not satisfied yet, we need to go to the
+ * next frag in the frag_list to satisfy remaining_len.
+ */
+ skb = skb_shinfo(skb)->frag_list ?: skb->next;
+
+ offset = offset - start;
+ } while (skb);
+
+ if (remaining_len) {
+ err = -EFAULT;
+ goto out;
+ }
+
+out:
+ tcp_xa_pool_commit(sk, &tcp_xa_pool);
+ if (!sent)
+ sent = err;
+
+ return sent;
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -2320,6 +2558,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int last_copied_dmabuf = -1; /* uninitialized */
int copied = 0;
u32 peek_seq;
u32 *seq;
@@ -2328,6 +2567,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
int target; /* Read at least this many bytes */
long timeo;
struct sk_buff *skb, *last;
+ u32 peek_offset = 0;
u32 urg_hole = 0;
err = -ENOTCONN;
@@ -2361,7 +2601,8 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
- peek_seq = tp->copied_seq;
+ peek_offset = max(sk_peek_offset(sk, flags), 0);
+ peek_seq = tp->copied_seq + peek_offset;
seq = &peek_seq;
}
@@ -2464,11 +2705,11 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
}
if ((flags & MSG_PEEK) &&
- (peek_seq - copied - urg_hole != tp->copied_seq)) {
+ (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
current->comm,
task_pid_nr(current));
- peek_seq = tp->copied_seq;
+ peek_seq = tp->copied_seq + peek_offset;
}
continue;
@@ -2497,19 +2738,51 @@ found_ok_skb:
}
if (!(flags & MSG_TRUNC)) {
- err = skb_copy_datagram_msg(skb, offset, msg, used);
- if (err) {
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
+ if (last_copied_dmabuf != -1 &&
+ last_copied_dmabuf != !skb_frags_readable(skb))
break;
+
+ if (skb_frags_readable(skb)) {
+ err = skb_copy_datagram_msg(skb, offset, msg,
+ used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ } else {
+ if (!(flags & MSG_SOCK_DEVMEM)) {
+ /* dmabuf skbs can only be received
+ * with the MSG_SOCK_DEVMEM flag.
+ */
+ if (!copied)
+ copied = -EFAULT;
+
+ break;
+ }
+
+ err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
+ used);
+ if (err <= 0) {
+ if (!copied)
+ copied = -EFAULT;
+
+ break;
+ }
+ used = err;
}
}
+ last_copied_dmabuf = !skb_frags_readable(skb);
+
WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
-
+ if (flags & MSG_PEEK)
+ sk_peek_offset_fwd(sk, used);
+ else
+ sk_peek_offset_bwd(sk, used);
tcp_rcv_space_adjust(sk);
skip_copy:
@@ -2637,6 +2910,10 @@ void tcp_set_state(struct sock *sk, int state)
if (oldstate != TCP_ESTABLISHED)
TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
break;
+ case TCP_CLOSE_WAIT:
+ if (oldstate == TCP_SYN_RECV)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ break;
case TCP_CLOSE:
if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
@@ -2648,7 +2925,7 @@ void tcp_set_state(struct sock *sk, int state)
inet_put_port(sk);
fallthrough;
default:
- if (oldstate == TCP_ESTABLISHED)
+ if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT)
TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
}
@@ -2710,7 +2987,7 @@ void tcp_shutdown(struct sock *sk, int how)
/* If we've already sent a FIN, or it's a closed state, skip this. */
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_SENT |
- TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ TCPF_CLOSE_WAIT)) {
/* Clear out any half completed packets. FIN if needed. */
if (tcp_close_state(sk))
tcp_send_fin(sk);
@@ -2744,7 +3021,15 @@ static bool tcp_too_many_orphans(int shift)
READ_ONCE(sysctl_tcp_max_orphans);
}
-bool tcp_check_oom(struct sock *sk, int shift)
+static bool tcp_out_of_memory(const struct sock *sk)
+{
+ if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+ sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
+ return true;
+ return false;
+}
+
+bool tcp_check_oom(const struct sock *sk, int shift)
{
bool too_many_orphans, out_of_socket_memory;
@@ -2805,7 +3090,8 @@ void __tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, sk->sk_allocation);
+ tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_TCP_ABORT_ON_CLOSE);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2819,7 +3105,7 @@ void __tcp_close(struct sock *sk, long timeout)
* machine. State transitions:
*
* TCP_ESTABLISHED -> TCP_FIN_WAIT1
- * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
+ * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult)
* TCP_CLOSE_WAIT -> TCP_LAST_ACK
*
* are legal only when FIN has been sent (i.e. in window),
@@ -2879,7 +3165,8 @@ adjudge_to_death:
struct tcp_sock *tp = tcp_sk(sk);
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_LINGER);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
@@ -2897,7 +3184,8 @@ adjudge_to_death:
if (sk->sk_state != TCP_CLOSE) {
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_MEMORY);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -2995,13 +3283,16 @@ int tcp_disconnect(struct sock *sk, int flags)
inet_csk_listen_stop(sk);
} else if (unlikely(tp->repair)) {
WRITE_ONCE(sk->sk_err, ECONNABORTED);
- } else if (tcp_need_reset(old_state) ||
- (tp->snd_nxt != tp->write_seq &&
- (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+ } else if (tcp_need_reset(old_state)) {
+ tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE);
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
+ } else if (tp->snd_nxt != tp->write_seq &&
+ (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
/* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
- tcp_send_active_reset(sk, gfp_any());
+ tcp_send_active_reset(sk, gfp_any(),
+ SK_RST_REASON_TCP_DISCONNECT_WITH_DATA);
WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -3010,6 +3301,7 @@ int tcp_disconnect(struct sock *sk, int flags)
__skb_queue_purge(&sk->sk_receive_queue);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
WRITE_ONCE(tp->urg_data, 0);
+ sk_set_peek_off(sk, -1);
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -3058,7 +3350,7 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
- dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
+ dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL)));
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
tp->segs_in = 0;
@@ -3379,7 +3671,7 @@ int tcp_set_window_clamp(struct sock *sk, int val)
if (!val) {
if (sk->sk_state != TCP_CLOSE)
return -EINVAL;
- tp->window_clamp = 0;
+ WRITE_ONCE(tp->window_clamp, 0);
} else {
u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp;
u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
@@ -3388,7 +3680,7 @@ int tcp_set_window_clamp(struct sock *sk, int val)
if (new_window_clamp == old_window_clamp)
return 0;
- tp->window_clamp = new_window_clamp;
+ WRITE_ONCE(tp->window_clamp, new_window_clamp);
if (new_window_clamp < old_window_clamp) {
/* need to apply the reserved mem provisioning only
* when shrinking the window clamp
@@ -4057,7 +4349,7 @@ int do_tcp_getsockopt(struct sock *sk, int level,
TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
- val = tp->window_clamp;
+ val = READ_ONCE(tp->window_clamp);
break;
case TCP_INFO: {
struct tcp_info info;
@@ -4342,6 +4634,9 @@ zerocopy_rcv_out:
return err;
}
+ case TCP_IS_MPTCP:
+ val = 0;
+ break;
default:
return -ENOPROTOOPT;
}
@@ -4430,7 +4725,7 @@ int tcp_md5_hash_key(struct tcp_sigpool *hp,
EXPORT_SYMBOL(tcp_md5_hash_key);
/* Called with rcu_read_lock() */
-enum skb_drop_reason
+static enum skb_drop_reason
tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
const void *saddr, const void *daddr,
int family, int l3index, const __u8 *hash_location)
@@ -4450,7 +4745,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
if (!key && hash_location) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- tcp_hash_fail("Unexpected MD5 Hash found", family, skb, "");
+ trace_tcp_hash_md5_unexpected(sk, skb);
return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
}
@@ -4465,29 +4760,90 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- if (family == AF_INET) {
- tcp_hash_fail("MD5 Hash failed", AF_INET, skb, "%s L3 index %d",
- genhash ? "tcp_v4_calc_md5_hash failed"
- : "", l3index);
- } else {
- if (genhash) {
- tcp_hash_fail("MD5 Hash failed",
- AF_INET6, skb, "L3 index %d",
- l3index);
- } else {
- tcp_hash_fail("MD5 Hash mismatch",
- AF_INET6, skb, "L3 index %d",
- l3index);
- }
- }
+ trace_tcp_hash_md5_mismatch(sk, skb);
return SKB_DROP_REASON_TCP_MD5FAILURE;
}
return SKB_NOT_DROPPED_YET;
}
-EXPORT_SYMBOL(tcp_inbound_md5_hash);
+#else
+static inline enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int l3index, const __u8 *hash_location)
+{
+ return SKB_NOT_DROPPED_YET;
+}
#endif
+/* Called with rcu_read_lock() */
+enum skb_drop_reason
+tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
+ const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int dif, int sdif)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct tcp_ao_hdr *aoh;
+ const __u8 *md5_location;
+ int l3index;
+
+ /* Invalid option or two times meet any of auth options */
+ if (tcp_parse_auth_options(th, &md5_location, &aoh)) {
+ trace_tcp_hash_bad_header(sk, skb);
+ return SKB_DROP_REASON_TCP_AUTH_HDR;
+ }
+
+ if (req) {
+ if (tcp_rsk_used_ao(req) != !!aoh) {
+ u8 keyid, rnext, maclen;
+
+ if (aoh) {
+ keyid = aoh->keyid;
+ rnext = aoh->rnext_keyid;
+ maclen = tcp_ao_hdr_maclen(aoh);
+ } else {
+ keyid = rnext = maclen = 0;
+ }
+
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+ trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen);
+ return SKB_DROP_REASON_TCP_AOFAILURE;
+ }
+ }
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
+
+ /* Fast path: unsigned segments */
+ if (likely(!md5_location && !aoh)) {
+ /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid
+ * for the remote peer. On TCP-AO established connection
+ * the last key is impossible to remove, so there's
+ * always at least one current_key.
+ */
+ if (tcp_ao_required(sk, saddr, family, l3index, true)) {
+ trace_tcp_hash_ao_required(sk, skb);
+ return SKB_DROP_REASON_TCP_AONOTFOUND;
+ }
+ if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ trace_tcp_hash_md5_required(sk, skb);
+ return SKB_DROP_REASON_TCP_MD5NOTFOUND;
+ }
+ return SKB_NOT_DROPPED_YET;
+ }
+
+ if (aoh)
+ return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh);
+
+ return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
+ l3index, md5_location);
+}
+EXPORT_SYMBOL_GPL(tcp_inbound_hash);
+
void tcp_done(struct sock *sk)
{
struct request_sock *req;
@@ -4542,6 +4898,13 @@ int tcp_abort(struct sock *sk, int err)
/* Don't race with userspace socket closes such as tcp_close. */
lock_sock(sk);
+ /* Avoid closing the same socket twice. */
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
+ return -ENOENT;
+ }
+
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
inet_csk_listen_stop(sk);
@@ -4551,19 +4914,13 @@ int tcp_abort(struct sock *sk, int err)
local_bh_disable();
bh_lock_sock(sk);
- if (!sock_flag(sk, SOCK_DEAD)) {
- WRITE_ONCE(sk->sk_err, err);
- /* This barrier is coupled with smp_rmb() in tcp_poll() */
- smp_wmb();
- sk_error_report(sk);
- if (tcp_need_reset(sk->sk_state))
- tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_done(sk);
- }
+ if (tcp_need_reset(sk->sk_state))
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_STATE);
+ tcp_done_with_error(sk, err);
bh_unlock_sock(sk);
local_bh_enable();
- tcp_write_queue_purge(sk);
if (!has_current_bpf_ctx())
release_sock(sk);
return 0;
@@ -4648,16 +5005,16 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
- CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache);
- CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 105);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
/* TXRX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
@@ -4670,7 +5027,11 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76);
+
+ /* 32bit arches with 8byte alignment on u64 fields might need padding
+ * before tcp_clock_cache.
+ */
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4);
/* RX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 3afeeb68e8a7..db6516092daf 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -16,6 +16,7 @@
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/icmp.h>
+#include <trace/events/tcp.h>
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ);
@@ -266,32 +267,49 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head)
kfree_sensitive(key);
}
-void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
+static void tcp_ao_info_free_rcu(struct rcu_head *head)
{
- struct tcp_ao_info *ao;
+ struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu);
struct tcp_ao_key *key;
struct hlist_node *n;
+ hlist_for_each_entry_safe(key, n, &ao->head, node) {
+ hlist_del(&key->node);
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ kfree_sensitive(key);
+ }
+ kfree(ao);
+ static_branch_slow_dec_deferred(&tcp_ao_needed);
+}
+
+static void tcp_ao_sk_omem_free(struct sock *sk, struct tcp_ao_info *ao)
+{
+ size_t total_ao_sk_mem = 0;
+ struct tcp_ao_key *key;
+
+ hlist_for_each_entry(key, &ao->head, node)
+ total_ao_sk_mem += tcp_ao_sizeof_key(key);
+ atomic_sub(total_ao_sk_mem, &sk->sk_omem_alloc);
+}
+
+void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
+{
+ struct tcp_ao_info *ao;
+
if (twsk) {
ao = rcu_dereference_protected(tcp_twsk(sk)->ao_info, 1);
- tcp_twsk(sk)->ao_info = NULL;
+ rcu_assign_pointer(tcp_twsk(sk)->ao_info, NULL);
} else {
ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
- tcp_sk(sk)->ao_info = NULL;
+ rcu_assign_pointer(tcp_sk(sk)->ao_info, NULL);
}
if (!ao || !refcount_dec_and_test(&ao->refcnt))
return;
- hlist_for_each_entry_safe(key, n, &ao->head, node) {
- hlist_del_rcu(&key->node);
- if (!twsk)
- atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
- call_rcu(&key->rcu, tcp_ao_key_free_rcu);
- }
-
- kfree_rcu(ao, rcu);
- static_branch_slow_dec_deferred(&tcp_ao_needed);
+ if (!twsk)
+ tcp_ao_sk_omem_free(sk, ao);
+ call_rcu(&ao->rcu, tcp_ao_info_free_rcu);
}
void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
@@ -884,17 +902,16 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
const struct tcp_ao_hdr *aoh, struct tcp_ao_key *key,
u8 *traffic_key, u8 *phash, u32 sne, int l3index)
{
- u8 maclen = aoh->length - sizeof(struct tcp_ao_hdr);
const struct tcphdr *th = tcp_hdr(skb);
+ u8 maclen = tcp_ao_hdr_maclen(aoh);
void *hash_buf = NULL;
if (maclen != tcp_ao_maclen(key)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
atomic64_inc(&info->counters.pkt_bad);
atomic64_inc(&key->pkt_bad);
- tcp_hash_fail("AO hash wrong length", family, skb,
- "%u != %d L3index: %d", maclen,
- tcp_ao_maclen(key), l3index);
+ trace_tcp_ao_wrong_maclen(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
return SKB_DROP_REASON_TCP_AOFAILURE;
}
@@ -909,8 +926,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
atomic64_inc(&info->counters.pkt_bad);
atomic64_inc(&key->pkt_bad);
- tcp_hash_fail("AO hash mismatch", family, skb,
- "L3index: %d", l3index);
+ trace_tcp_ao_mismatch(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
kfree(hash_buf);
return SKB_DROP_REASON_TCP_AOFAILURE;
}
@@ -927,19 +944,21 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
int l3index, const struct tcp_ao_hdr *aoh)
{
const struct tcphdr *th = tcp_hdr(skb);
+ u8 maclen = tcp_ao_hdr_maclen(aoh);
u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */
struct tcp_ao_info *info;
enum skb_drop_reason ret;
struct tcp_ao_key *key;
__be32 sisn, disn;
u8 *traffic_key;
+ int state;
u32 sne = 0;
info = rcu_dereference(tcp_sk(sk)->ao_info);
if (!info) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
- tcp_hash_fail("AO key not found", family, skb,
- "keyid: %u L3index: %d", aoh->keyid, l3index);
+ trace_tcp_ao_key_not_found(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
return SKB_DROP_REASON_TCP_AOUNEXPECTED;
}
@@ -948,8 +967,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
disn = 0;
}
+ state = READ_ONCE(sk->sk_state);
/* Fast-path */
- if (likely((1 << sk->sk_state) & TCP_AO_ESTABLISHED)) {
+ if (likely((1 << state) & TCP_AO_ESTABLISHED)) {
enum skb_drop_reason err;
struct tcp_ao_key *current_key;
@@ -979,6 +999,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
current_key = READ_ONCE(info->current_key);
/* Key rotation: the peer asks us to use new key (RNext) */
if (unlikely(aoh->rnext_keyid != current_key->sndid)) {
+ trace_tcp_ao_rnext_request(sk, skb, current_key->sndid,
+ aoh->rnext_keyid,
+ tcp_ao_hdr_maclen(aoh));
/* If the key is not found we do nothing. */
key = tcp_ao_established_key(info, aoh->rnext_keyid, -1);
if (key)
@@ -988,6 +1011,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
return SKB_NOT_DROPPED_YET;
}
+ if (unlikely(state == TCP_CLOSE))
+ return SKB_DROP_REASON_TCP_CLOSE;
+
/* Lookup key based on peer address and keyid.
* current_key and rnext_key must not be used on tcp listen
* sockets as otherwise:
@@ -1001,7 +1027,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
if (th->syn && !th->ack)
goto verify_hash;
- if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
+ if ((1 << state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
/* Make the initial syn the likely case here */
if (unlikely(req)) {
sne = tcp_ao_compute_sne(0, tcp_rsk(req)->rcv_isn,
@@ -1018,14 +1044,14 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
/* no way to figure out initial sisn/disn - drop */
return SKB_DROP_REASON_TCP_FLAGS;
}
- } else if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ } else if ((1 << state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
disn = info->lisn;
if (th->syn || th->rst)
sisn = th->seq;
else
sisn = info->risn;
} else {
- WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", sk->sk_state);
+ WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", state);
return SKB_DROP_REASON_TCP_AOFAILURE;
}
verify_hash:
@@ -1041,8 +1067,8 @@ verify_hash:
key_not_found:
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
atomic64_inc(&info->counters.key_not_found);
- tcp_hash_fail("Requested by the peer AO key id not found",
- family, skb, "L3index: %d", l3index);
+ trace_tcp_ao_key_not_found(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
return SKB_DROP_REASON_TCP_AOKEYNOTFOUND;
}
@@ -1068,6 +1094,7 @@ void tcp_ao_connect_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_ao_info *ao_info;
+ struct hlist_node *next;
union tcp_ao_addr *addr;
struct tcp_ao_key *key;
int family, l3index;
@@ -1090,7 +1117,7 @@ void tcp_ao_connect_init(struct sock *sk)
l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
sk->sk_bound_dev_if);
- hlist_for_each_entry_rcu(key, &ao_info->head, node) {
+ hlist_for_each_entry_safe(key, next, &ao_info->head, node) {
if (!tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1))
continue;
@@ -1962,8 +1989,10 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
first = true;
}
- if (cmd.ao_required && tcp_ao_required_verify(sk))
- return -EKEYREJECTED;
+ if (cmd.ao_required && tcp_ao_required_verify(sk)) {
+ err = -EKEYREJECTED;
+ goto out;
+ }
/* For sockets in TCP_CLOSED it's possible set keys that aren't
* matching the future peer (address/port/VRF/etc),
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 05dc2d05bc7c..760941e55153 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
bbr_update_gains(sk);
}
-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
u32 bw;
@@ -1156,8 +1156,6 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
};
BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids)
-#ifdef CONFIG_X86
-#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, bbr_init)
BTF_ID_FLAGS(func, bbr_main)
BTF_ID_FLAGS(func, bbr_sndbuf_expand)
@@ -1166,8 +1164,6 @@ BTF_ID_FLAGS(func, bbr_cwnd_event)
BTF_ID_FLAGS(func, bbr_ssthresh)
BTF_ID_FLAGS(func, bbr_min_tso_segs)
BTF_ID_FLAGS(func, bbr_set_state)
-#endif
-#endif
BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids)
static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 53b0d62fd2c2..e7658c5d6b79 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -30,7 +30,7 @@ void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
}
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
- struct sk_msg *msg, u32 apply_bytes, int flags)
+ struct sk_msg *msg, u32 apply_bytes)
{
bool apply = apply_bytes;
struct scatterlist *sge;
@@ -167,7 +167,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
if (unlikely(!psock))
return -EPIPE;
- ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
+ ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) :
tcp_bpf_push_locked(sk, msg, bytes, flags, false);
sk_psock_put(sk, psock);
return ret;
@@ -577,7 +577,7 @@ out_err:
err = sk_stream_error(sk, msg->msg_flags, err);
release_sock(sk);
sk_psock_put(sk, psock);
- return copied ? copied : err;
+ return copied > 0 ? copied : err;
}
enum {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 28ffcfbeef14..0306d257fa64 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -46,8 +46,7 @@ void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
}
/* Must be called with rcu lock held */
-static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
- const char *name)
+static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name)
{
struct tcp_congestion_ops *ca = tcp_ca_find(name);
@@ -178,7 +177,7 @@ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_cong
return ret;
}
-u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
u32 key = TCP_CA_UNSPEC;
@@ -186,7 +185,7 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
might_sleep();
rcu_read_lock();
- ca = tcp_ca_find_autoload(net, name);
+ ca = tcp_ca_find_autoload(name);
if (ca) {
key = ca->key;
*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
@@ -203,9 +202,10 @@ char *tcp_ca_get_name_by_key(u32 key, char *buffer)
rcu_read_lock();
ca = tcp_ca_find_key(key);
- if (ca)
- ret = strncpy(buffer, ca->name,
- TCP_CA_NAME_MAX);
+ if (ca) {
+ strscpy(buffer, ca->name, TCP_CA_NAME_MAX);
+ ret = buffer;
+ }
rcu_read_unlock();
return ret;
@@ -283,7 +283,7 @@ int tcp_set_default_congestion_control(struct net *net, const char *name)
int ret;
rcu_read_lock();
- ca = tcp_ca_find_autoload(net, name);
+ ca = tcp_ca_find_autoload(name);
if (!ca) {
ret = -ENOENT;
} else if (!bpf_try_module_get(ca, ca->owner)) {
@@ -338,7 +338,7 @@ void tcp_get_default_congestion_control(struct net *net, char *name)
rcu_read_lock();
ca = rcu_dereference(net->ipv4.tcp_congestion_control);
- strncpy(name, ca->name, TCP_CA_NAME_MAX);
+ strscpy(name, ca->name, TCP_CA_NAME_MAX);
rcu_read_unlock();
}
@@ -421,7 +421,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
if (!load)
ca = tcp_ca_find(name);
else
- ca = tcp_ca_find_autoload(sock_net(sk), name);
+ ca = tcp_ca_find_autoload(name);
/* No change asking for existing value */
if (ca == icsk->icsk_ca_ops) {
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 44869ea089e3..5dbed91c6178 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -486,16 +486,12 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
};
BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids)
-#ifdef CONFIG_X86
-#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, cubictcp_init)
BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
BTF_ID_FLAGS(func, cubictcp_cong_avoid)
BTF_ID_FLAGS(func, cubictcp_state)
BTF_ID_FLAGS(func, cubictcp_cwnd_event)
BTF_ID_FLAGS(func, cubictcp_acked)
-#endif
-#endif
BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids)
static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index e33fbe4933e4..8a45a4aea933 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -58,7 +58,18 @@ struct dctcp {
};
static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
-module_param(dctcp_shift_g, uint, 0644);
+
+static int dctcp_shift_g_set(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10);
+}
+
+static const struct kernel_param_ops dctcp_shift_g_ops = {
+ .set = dctcp_shift_g_set,
+ .get = param_get_uint,
+};
+
+module_param_cb(dctcp_shift_g, &dctcp_shift_g_ops, &dctcp_shift_g, 0644);
MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
@@ -261,16 +272,12 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
};
BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids)
-#ifdef CONFIG_X86
-#ifdef CONFIG_DYNAMIC_FTRACE
BTF_ID_FLAGS(func, dctcp_init)
BTF_ID_FLAGS(func, dctcp_update_alpha)
BTF_ID_FLAGS(func, dctcp_cwnd_event)
BTF_ID_FLAGS(func, dctcp_ssthresh)
BTF_ID_FLAGS(func, dctcp_cwnd_undo)
BTF_ID_FLAGS(func, dctcp_state)
-#endif
-#endif
BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids)
static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 8ed54e7334a9..0f523cbfe329 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -49,7 +49,7 @@ void tcp_fastopen_ctx_destroy(struct net *net)
{
struct tcp_fastopen_context *ctxt;
- ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL);
+ ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL));
if (ctxt)
call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
@@ -80,9 +80,10 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
if (sk) {
q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
- octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx);
+ octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx)));
} else {
- octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx);
+ octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx,
+ RCU_INITIALIZER(ctx)));
}
if (octx)
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 52b1f2665dfa..81b96331b2bb 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -185,7 +185,7 @@ static inline void htcp_alpha_update(struct htcp *ca)
u32 scale = (HZ << 3) / (10 * minRTT);
/* clamping ratio to interval [0.5,10]<<3 */
- scale = min(max(scale, 1U << 2), 10U << 3);
+ scale = clamp(scale, 1U << 2, 10U << 3);
factor = (factor << 3) / scale;
if (!factor)
factor = 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5d874817a78d..9f314dfa1490 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -72,6 +72,7 @@
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
+#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
@@ -237,9 +238,14 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+ u8 old_ratio = tcp_sk(sk)->scaling_ratio;
do_div(val, skb->truesize);
tcp_sk(sk)->scaling_ratio = val ? val : 1;
+
+ if (old_ratio != tcp_sk(sk)->scaling_ratio)
+ WRITE_ONCE(tcp_sk(sk)->window_clamp,
+ tcp_win_from_space(sk, sk->sk_rcvbuf));
}
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
@@ -563,19 +569,20 @@ static void tcp_init_buffer_space(struct sock *sk)
maxwin = tcp_full_space(sk);
if (tp->window_clamp >= maxwin) {
- tp->window_clamp = maxwin;
+ WRITE_ONCE(tp->window_clamp, maxwin);
if (tcp_app_win && maxwin > 4 * tp->advmss)
- tp->window_clamp = max(maxwin -
- (maxwin >> tcp_app_win),
- 4 * tp->advmss);
+ WRITE_ONCE(tp->window_clamp,
+ max(maxwin - (maxwin >> tcp_app_win),
+ 4 * tp->advmss));
}
/* Force reservation of one segment. */
if (tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
- tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+ WRITE_ONCE(tp->window_clamp,
+ max(2 * tp->advmss, maxwin - tp->advmss));
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_jiffies32;
@@ -773,7 +780,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
- tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
+ WRITE_ONCE(tp->window_clamp,
+ tcp_win_from_space(sk, rcvbuf));
}
}
tp->rcvq_space.space = copied;
@@ -911,7 +919,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max_us = tcp_rto_min_us(sk);
- tcp_bpf_rtt(sk);
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
} else {
/* no previous measure. */
@@ -921,7 +929,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
- tcp_bpf_rtt(sk);
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
tp->srtt_us = max(1U, srtt);
}
@@ -2126,8 +2134,16 @@ void tcp_clear_retrans(struct tcp_sock *tp)
static inline void tcp_init_undo(struct tcp_sock *tp)
{
tp->undo_marker = tp->snd_una;
+
/* Retransmission still in flight may cause DSACKs later. */
- tp->undo_retrans = tp->retrans_out ? : -1;
+ /* First, account for regular retransmits in flight: */
+ tp->undo_retrans = tp->retrans_out;
+ /* Next, account for TLP retransmits in flight: */
+ if (tp->tlp_high_seq && tp->tlp_retrans)
+ tp->undo_retrans++;
+ /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */
+ if (!tp->undo_retrans)
+ tp->undo_retrans = -1;
}
static bool tcp_is_rack(const struct sock *sk)
@@ -2206,6 +2222,7 @@ void tcp_enter_loss(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
@@ -2779,13 +2796,37 @@ static void tcp_mtup_probe_success(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
+/* Sometimes we deduce that packets have been dropped due to reasons other than
+ * congestion, like path MTU reductions or failed client TFO attempts. In these
+ * cases we call this function to retransmit as many packets as cwnd allows,
+ * without reducing cwnd. Given that retransmits will set retrans_stamp to a
+ * non-zero value (and may do so in a later calling context due to TSQ), we
+ * also enter CA_Loss so that we track when all retransmitted packets are ACKed
+ * and clear retrans_stamp when that happens (to ensure later recurring RTOs
+ * are using the correct retrans_stamp and don't declare ETIMEDOUT
+ * prematurely).
+ */
+static void tcp_non_congestion_loss_retransmit(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = 0;
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* Do a simple retransmit without using the backoff mechanisms in
* tcp_timer. This is used for path mtu discovery.
* The socket is already locked here.
*/
void tcp_simple_retransmit(struct sock *sk)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int mss;
@@ -2825,14 +2866,7 @@ void tcp_simple_retransmit(struct sock *sk)
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
- if (icsk->icsk_ca_state != TCP_CA_Loss) {
- tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->prior_ssthresh = 0;
- tp->undo_marker = 0;
- tcp_set_ca_state(sk, TCP_CA_Loss);
- }
- tcp_xmit_retransmit_queue(sk);
+ tcp_non_congestion_loss_retransmit(sk);
}
EXPORT_SYMBOL(tcp_simple_retransmit);
@@ -3057,7 +3091,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
return;
if (tcp_try_undo_dsack(sk))
- tcp_try_keep_open(sk);
+ tcp_try_to_open(sk, flag);
tcp_identify_packet_loss(sk, ack_flag);
if (icsk->icsk_ca_state != TCP_CA_Recovery) {
@@ -3539,7 +3573,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->cong_control) {
- icsk->icsk_ca_ops->cong_control(sk, rs);
+ icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
return;
}
@@ -3575,8 +3609,10 @@ static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
ao = rcu_dereference_protected(tp->ao_info,
lockdep_sock_is_held((struct sock *)tp));
- if (ao && ack < tp->snd_una)
+ if (ao && ack < tp->snd_una) {
ao->snd_sne++;
+ trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne);
+ }
#endif
}
@@ -3601,8 +3637,10 @@ static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
ao = rcu_dereference_protected(tp->ao_info,
lockdep_sock_is_held((struct sock *)tp));
- if (ao && seq < tp->rcv_nxt)
+ if (ao && seq < tp->rcv_nxt) {
ao->rcv_sne++;
+ trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne);
+ }
#endif
}
@@ -4204,6 +4242,13 @@ void tcp_parse_options(const struct net *net,
*/
break;
#endif
+#ifdef CONFIG_TCP_AO
+ case TCPOPT_AO:
+ /* TCP AO has already been checked
+ * (see tcp_inbound_ao_hash()).
+ */
+ break;
+#endif
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
opsize - TCPOLEN_FASTOPEN_BASE,
@@ -4433,9 +4478,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
return SKB_NOT_DROPPED_YET;
}
+
+void tcp_done_with_error(struct sock *sk, int err)
+{
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ WRITE_ONCE(sk->sk_err, err);
+ smp_wmb();
+
+ tcp_write_queue_purge(sk);
+ tcp_done(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+}
+EXPORT_SYMBOL(tcp_done_with_error);
+
/* When we get a reset we do this. */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
+ int err;
+
trace_tcp_receive_reset(sk);
/* mptcp can't tell us to ignore reset pkts,
@@ -4447,24 +4509,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
- WRITE_ONCE(sk->sk_err, ECONNREFUSED);
+ err = ECONNREFUSED;
break;
case TCP_CLOSE_WAIT:
- WRITE_ONCE(sk->sk_err, EPIPE);
+ err = EPIPE;
break;
case TCP_CLOSE:
return;
default:
- WRITE_ONCE(sk->sk_err, ECONNRESET);
+ err = ECONNRESET;
}
- /* This barrier is coupled with smp_rmb() in tcp_poll() */
- smp_wmb();
-
- tcp_write_queue_purge(sk);
- tcp_done(sk);
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk_error_report(sk);
+ tcp_done_with_error(sk, err);
}
/*
@@ -4800,13 +4855,8 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
- if (!mptcp_skb_can_collapse(to, from))
- return false;
-
-#ifdef CONFIG_TLS_DEVICE
- if (from->decrypted != to->decrypted)
+ if (!tcp_skb_can_collapse_rx(to, from))
return false;
-#endif
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -4848,7 +4898,7 @@ static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb,
enum skb_drop_reason reason)
{
sk_drops_add(sk, skb);
- kfree_skb_reason(skb, reason);
+ sk_skb_reason_drop(sk, skb, reason);
}
/* This one checks to see if we can put data from the
@@ -5174,6 +5224,16 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
+ /* Some stacks are known to send bare FIN packets
+ * in a loop even if we send RWIN 0 in our ACK.
+ * Accepting this FIN does not hurt memory pressure
+ * because the FIN flag will simply be merged to the
+ * receive queue tail skb in most cases.
+ */
+ if (!skb->len &&
+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ goto queue_and_out;
+
reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
@@ -5188,7 +5248,7 @@ queue_and_out:
inet_csk_schedule_ack(sk);
sk->sk_data_ready(sk);
- if (skb_queue_len(&sk->sk_receive_queue)) {
+ if (skb_queue_len(&sk->sk_receive_queue) && skb->len) {
reason = SKB_DROP_REASON_PROTO_MEM;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
@@ -5331,6 +5391,9 @@ restart:
for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
n = tcp_skb_next(skb, list);
+ if (!skb_frags_readable(skb))
+ goto skip_this;
+
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list, root);
@@ -5351,17 +5414,20 @@ restart:
break;
}
- if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
+ if (n && n != tail && skb_frags_readable(n) &&
+ tcp_skb_can_collapse_rx(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
}
+skip_this:
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
}
if (end_of_skbs ||
- (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+ !skb_frags_readable(skb))
return;
__skb_queue_head_init(&tmp);
@@ -5375,9 +5441,7 @@ restart:
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
-#ifdef CONFIG_TLS_DEVICE
- nskb->decrypted = skb->decrypted;
-#endif
+ skb_copy_decrypted(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
@@ -5404,13 +5468,10 @@ restart:
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
- !mptcp_skb_can_collapse(nskb, skb) ||
- (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ !tcp_skb_can_collapse_rx(nskb, skb) ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+ !skb_frags_readable(skb))
goto end;
-#ifdef CONFIG_TLS_DEVICE
- if (skb->decrypted != nskb->decrypted)
- goto end;
-#endif
}
}
}
@@ -5949,6 +6010,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
+ if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
+ TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
+ TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
+ TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt)
+ goto pass;
syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
@@ -5958,6 +6024,7 @@ syn_challenge:
goto discard;
}
+pass:
bpf_skops_parse_hdr(sk, skb);
return true;
@@ -6288,7 +6355,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
skb_rbtree_walk_from(data)
tcp_mark_skb_lost(sk, data);
- tcp_xmit_retransmit_queue(sk);
+ tcp_non_congestion_loss_retransmit(sk);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
@@ -6426,7 +6493,8 @@ consume:
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
- tp->window_clamp = min(tp->window_clamp, 65535U);
+ WRITE_ONCE(tp->window_clamp,
+ min(tp->window_clamp, 65535U));
}
if (tp->rx_opt.saw_tstamp) {
@@ -6761,6 +6829,8 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ tcp_shutdown(sk, SEND_SHUTDOWN);
break;
case TCP_FIN_WAIT1: {
@@ -6971,35 +7041,10 @@ static void tcp_openreq_init(struct request_sock *req,
#endif
}
-struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener,
- bool attach_listener)
-{
- struct request_sock *req = reqsk_alloc(ops, sk_listener,
- attach_listener);
-
- if (req) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- ireq->ireq_opt = NULL;
-#if IS_ENABLED(CONFIG_IPV6)
- ireq->pktopts = NULL;
-#endif
- atomic64_set(&ireq->ir_cookie, 0);
- ireq->ireq_state = TCP_NEW_SYN_RECV;
- write_pnet(&ireq->ireq_net, sock_net(sk_listener));
- ireq->ireq_family = sk_listener->sk_family;
- req->timeout = TCP_TIMEOUT_INIT;
- }
-
- return req;
-}
-EXPORT_SYMBOL(inet_reqsk_alloc);
-
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
+static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
@@ -7100,7 +7145,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct sock *sk, struct sk_buff *skb)
{
struct tcp_fastopen_cookie foc = { .len = -1 };
- __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
@@ -7110,21 +7154,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct dst_entry *dst;
struct flowi fl;
u8 syncookies;
+ u32 isn;
#ifdef CONFIG_TCP_AO
const struct tcp_ao_hdr *aoh;
#endif
- syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+ isn = __this_cpu_read(tcp_tw_isn);
+ if (isn) {
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ __this_cpu_write(tcp_tw_isn, 0);
+ } else {
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
- if (!want_cookie)
- goto drop;
+ if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
+ want_cookie = tcp_syn_flood_action(sk,
+ rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
}
if (sk_acceptq_is_full(sk)) {
@@ -7163,7 +7214,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
- dst = af_ops->route_req(sk, skb, &fl, req);
+ dst = af_ops->route_req(sk, skb, &fl, req, isn);
if (!dst)
goto drop_and_free;
@@ -7240,7 +7291,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie) {
req->timeout = tcp_timeout_init((struct sock *)req);
- inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
+ if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req,
+ req->timeout))) {
+ reqsk_free(req);
+ return 0;
+ }
+
}
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a22ee5838751..5afe5e57c89b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -70,6 +70,7 @@
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
+#include <net/rstreason.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
@@ -78,6 +79,7 @@
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
+#include <linux/skbuff_ref.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
@@ -92,7 +94,11 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
+static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+static DEFINE_MUTEX(tcp_exit_batch_mutex);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
@@ -113,6 +119,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
const struct inet_timewait_sock *tw = inet_twsk(sktw);
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
+ int ts_recent_stamp;
+
+ if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
+ reuse = 0;
if (reuse == 2) {
/* Still does not detect *everything* that goes through
@@ -151,9 +161,16 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
If TW bucket has been already destroyed we fall back to VJ's scheme
and use initial timestamp retrieved from peer table.
*/
- if (tcptw->tw_ts_recent_stamp &&
+ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ if (ts_recent_stamp &&
(!twp || (reuse && time_after32(ktime_get_seconds(),
- tcptw->tw_ts_recent_stamp)))) {
+ ts_recent_stamp)))) {
+ /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
+ * and releasing the bucket lock.
+ */
+ if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
+ return 0;
+
/* In case of repair and re-using TIME-WAIT sockets we still
* want to be sure that it is safe as above but honor the
* sequence numbers and time stamps set as part of the repair
@@ -171,10 +188,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
if (!seq)
seq = 1;
WRITE_ONCE(tp->write_seq, seq);
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
+ tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
}
- sock_hold(sktw);
+
return 1;
}
@@ -604,15 +621,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
- if (!sock_owned_by_user(sk)) {
- WRITE_ONCE(sk->sk_err, err);
-
- sk_error_report(sk);
-
- tcp_done(sk);
- } else {
+ if (!sock_owned_by_user(sk))
+ tcp_done_with_error(sk, err);
+ else
WRITE_ONCE(sk->sk_err_soft, err);
- }
goto out;
}
@@ -723,7 +735,8 @@ out:
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
+ enum sk_rst_reason reason)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
@@ -866,11 +879,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
* routing might fail in this case. No choice here, if we choose to force
* input interface, we will misroute in case of asymmetric route.
*/
- if (sk) {
+ if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
- if (sk_fullsock(sk))
- trace_tcp_send_reset(sk, skb);
- }
+
+ trace_tcp_send_reset(sk, skb, reason);
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
@@ -878,7 +890,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
+
sock_net_set(ctl_sk, net);
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
@@ -903,6 +917,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
@@ -998,7 +1013,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = this_cpu_read(ipv4_tcp_sk);
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
sock_net_set(ctl_sk, net);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
@@ -1013,6 +1029,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
}
@@ -1050,19 +1067,17 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
#else
if (0) {
#endif
-#ifdef CONFIG_TCP_MD5SIG
- } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ } else if (static_branch_tcp_md5()) {
key.md5_key = tcp_twsk_md5_key(tcptw);
if (key.md5_key)
key.type = TCP_KEY_MD5;
-#endif
}
tcp_v4_send_ack(sk, skb,
- tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_tw_tsval(tcptw),
- tcptw->tw_ts_recent,
+ READ_ONCE(tcptw->tw_ts_recent),
tw->tw_bound_dev_if, &key,
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
tw->tw_tos,
@@ -1124,8 +1139,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
#else
if (0) {
#endif
-#ifdef CONFIG_TCP_MD5SIG
- } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ } else if (static_branch_tcp_md5()) {
const union tcp_md5_addr *addr;
int l3index;
@@ -1134,17 +1148,11 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
if (key.md5_key)
key.type = TCP_KEY_MD5;
-#endif
}
- /* RFC 7323 2.3
- * The window field (SEG.WND) of every outgoing segment, with the
- * exception of <SYN> segments, MUST be right-shifted by
- * Rcv.Wind.Shift bits:
- */
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
- req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
+ tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
tcp_rsk_tsval(tcp_rsk(req)),
READ_ONCE(req->ts_recent),
0, &key,
@@ -1667,7 +1675,8 @@ static void tcp_v4_init_req(struct request_sock *req,
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
struct sk_buff *skb,
struct flowi *fl,
- struct request_sock *req)
+ struct request_sock *req,
+ u32 tw_isn)
{
tcp_v4_init_req(req, sk, skb);
@@ -1934,9 +1943,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
reset:
- tcp_v4_send_reset(rsk, skb);
+ tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
- kfree_skb_reason(skb, reason);
+ sk_skb_reason_drop(sk, skb, reason);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
@@ -1995,7 +2004,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
enum skb_drop_reason *reason)
{
- u32 limit, tail_gso_size, tail_gso_segs;
+ u32 tail_gso_size, tail_gso_segs;
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
@@ -2004,6 +2013,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
bool fragstolen;
u32 gso_segs;
u32 gso_size;
+ u64 limit;
int delta;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
@@ -2045,10 +2055,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
((TCP_SKB_CB(tail)->tcp_flags ^
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
-#ifdef CONFIG_TLS_DEVICE
- tail->decrypted != skb->decrypted ||
-#endif
- !mptcp_skb_can_collapse(tail, skb) ||
+ !tcp_skb_can_collapse_rx(tail, skb) ||
thtail->doff != th->doff ||
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
goto no_coalesce;
@@ -2101,7 +2108,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
__skb_push(skb, hdrlen);
no_coalesce:
- limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
+ /* sk->sk_backlog.len is reset only at the end of __release_sock().
+ * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
+ * sk_rcvbuf in normal conditions.
+ */
+ limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
+
+ limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
/* Only socket owner can try to collapse/prune rx queues
* to reduce memory overhead, so add a little headroom here.
@@ -2109,6 +2122,8 @@ no_coalesce:
*/
limit += 64 * 1024;
+ limit = min_t(u64, limit, UINT_MAX);
+
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
@@ -2148,7 +2163,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
- TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->has_rxtstamp =
@@ -2167,9 +2181,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
+ struct sock *sk = NULL;
bool refcounted;
- struct sock *sk;
int ret;
+ u32 isn;
drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
@@ -2207,7 +2222,6 @@ lookup:
if (!sk)
goto no_tcp_socket;
-process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
@@ -2279,7 +2293,10 @@ process:
} else {
drop_reason = tcp_child_process(sk, nsk, skb);
if (drop_reason) {
- tcp_v4_send_reset(nsk, skb);
+ enum sk_rst_reason rst_reason;
+
+ rst_reason = sk_rst_convert_drop_reason(drop_reason);
+ tcp_v4_send_reset(nsk, skb, rst_reason);
goto discard_and_relse;
}
sock_put(sk);
@@ -2287,6 +2304,7 @@ process:
}
}
+process:
if (static_branch_unlikely(&ip4_min_ttl)) {
/* min_ttl can be changed concurrently from do_ip_setsockopt() */
if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
@@ -2357,13 +2375,13 @@ csum_error:
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
- tcp_v4_send_reset(NULL, skb);
+ tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
}
discard_it:
SKB_DR_OR(drop_reason, NOT_SPECIFIED);
/* Discard frame. */
- kfree_skb_reason(skb, drop_reason);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
discard_and_relse:
@@ -2385,7 +2403,7 @@ do_time_wait:
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
- switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+ switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(net,
net->ipv4.tcp_death_row.hashinfo,
@@ -2399,6 +2417,7 @@ do_time_wait:
sk = sk2;
tcp_v4_restore_cb(skb);
refcounted = false;
+ __this_cpu_write(tcp_tw_isn, isn);
goto process;
}
}
@@ -2408,7 +2427,7 @@ do_time_wait:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
- tcp_v4_send_reset(sk, skb);
+ tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
@@ -2418,7 +2437,6 @@ do_time_wait:
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
- .twsk_unique = tcp_twsk_unique,
.twsk_destructor= tcp_twsk_destructor,
};
@@ -2495,10 +2513,25 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
}
#endif
+static void tcp_release_user_frags(struct sock *sk)
+{
+#ifdef CONFIG_PAGE_POOL
+ unsigned long index;
+ void *netmem;
+
+ xa_for_each(&sk->sk_user_frags, index, netmem)
+ WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
+#endif
+}
+
void tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ tcp_release_user_frags(sk);
+
+ xa_destroy(&sk->sk_user_frags);
+
trace_tcp_destroy_sock(sk);
tcp_clear_xmit_timers(sk);
@@ -2931,7 +2964,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
- i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+ i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
refcount_read(&tw->tw_refcnt), tw);
}
@@ -3493,6 +3526,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_shrink_window = 0;
net->ipv4.sysctl_tcp_pingpong_thresh = 1;
+ net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
return 0;
}
@@ -3501,13 +3535,25 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
- tcp_twsk_purge(net_exit_list, AF_INET);
+ /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
+ * and failed setup_net error unwinding path are serialized.
+ *
+ * tcp_twsk_purge() handles twsk in any dead netns, not just those in
+ * net_exit_list, the thread that dismantles a particular twsk must
+ * do so without other thread progressing to refcount_dec_and_test() of
+ * tcp_death_row.tw_refcount.
+ */
+ mutex_lock(&tcp_exit_batch_mutex);
+
+ tcp_twsk_purge(net_exit_list);
list_for_each_entry(net, net_exit_list, exit_list) {
inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
tcp_fastopen_ctx_destroy(net);
}
+
+ mutex_unlock(&tcp_exit_batch_mutex);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -3607,7 +3653,9 @@ void __init tcp_v4_init(void)
*/
inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
- per_cpu(ipv4_tcp_sk, cpu) = sk;
+ sk->sk_clockid = CLOCK_MONOTONIC;
+
+ per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
}
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c2a925538542..95669935494e 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -617,8 +617,13 @@ static struct genl_family tcp_metrics_nl_family;
static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
[TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
- [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
- .len = sizeof(struct in6_addr), },
+ [TCP_METRICS_ATTR_ADDR_IPV6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+
+ [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_SADDR_IPV6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+
/* Following attributes are not received for GET/DEL,
* we keep them for reference
*/
@@ -766,6 +771,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
unsigned int max_rows = 1U << tcp_metrics_hash_log;
unsigned int row, s_row = cb->args[0];
int s_col = cb->args[1], col = s_col;
+ int res = 0;
for (row = s_row; row < max_rows; row++, s_col = 0) {
struct tcp_metrics_block *tm;
@@ -778,7 +784,8 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
continue;
if (col < s_col)
continue;
- if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
+ res = tcp_metrics_dump_info(skb, cb, tm);
+ if (res < 0) {
rcu_read_unlock();
goto done;
}
@@ -789,7 +796,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
done:
cb->args[0] = row;
cb->args[1] = col;
- return skb->len;
+ return res;
}
static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
@@ -808,8 +815,6 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
if (a) {
struct in6_addr in6;
- if (nla_len(a) != sizeof(struct in6_addr))
- return -EINVAL;
in6 = nla_get_in6_addr(a);
inetpeer_set_addr_v6(addr, &in6);
if (hash)
@@ -986,6 +991,7 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = {
.maxattr = TCP_METRICS_ATTR_MAX,
.policy = tcp_metrics_nl_policy,
.netnsok = true,
+ .parallel_ops = true,
.module = THIS_MODULE,
.small_ops = tcp_metrics_nl_ops,
.n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f0761f060a83..bb1fe1ba867a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -22,6 +22,7 @@
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
+#include <net/rstreason.h>
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -51,16 +52,17 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
return TCP_TW_SUCCESS;
}
-static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq)
+static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
+ u32 rcv_nxt)
{
#ifdef CONFIG_TCP_AO
struct tcp_ao_info *ao;
ao = rcu_dereference(tcptw->ao_info);
- if (unlikely(ao && seq < tcptw->tw_rcv_nxt))
+ if (unlikely(ao && seq < rcv_nxt))
WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
#endif
- tcptw->tw_rcv_nxt = seq;
+ WRITE_ONCE(tcptw->tw_rcv_nxt, seq);
}
/*
@@ -95,45 +97,48 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq)
*/
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
- const struct tcphdr *th)
+ const struct tcphdr *th, u32 *tw_isn)
{
- struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
+ struct tcp_options_received tmp_opt;
bool paws_reject = false;
+ int ts_recent_stamp;
tmp_opt.saw_tstamp = 0;
- if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
+ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
if (tmp_opt.rcv_tsecr)
tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
- tmp_opt.ts_recent = tcptw->tw_ts_recent;
- tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
+ tmp_opt.ts_recent_stamp = ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
- if (tw->tw_substate == TCP_FIN_WAIT2) {
+ if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
/* Just repeat all the checks of tcp_rcv_state_process() */
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcptw->tw_rcv_nxt,
- tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
+ rcv_nxt,
+ rcv_nxt + tcptw->tw_rcv_wnd))
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
if (th->rst)
goto kill;
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt))
return TCP_TW_RST;
/* Dup ACK? */
if (!th->ack ||
- !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+ !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) ||
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
@@ -143,16 +148,19 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
+ TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1)
return TCP_TW_RST;
/* FIN arrived, enter true time-wait state. */
- tw->tw_substate = TCP_TIME_WAIT;
- twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq);
+ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT);
+ twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq,
+ rcv_nxt);
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent_stamp = ktime_get_seconds();
- tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ WRITE_ONCE(tcptw->tw_ts_recent_stamp,
+ ktime_get_seconds());
+ WRITE_ONCE(tcptw->tw_ts_recent,
+ tmp_opt.rcv_tsval);
}
inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
@@ -177,7 +185,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
*/
if (!paws_reject &&
- (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == rcv_nxt &&
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
@@ -196,8 +204,10 @@ kill:
}
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
- tcptw->tw_ts_recent_stamp = ktime_get_seconds();
+ WRITE_ONCE(tcptw->tw_ts_recent,
+ tmp_opt.rcv_tsval);
+ WRITE_ONCE(tcptw->tw_ts_recent_stamp,
+ ktime_get_seconds());
}
inet_twsk_put(tw);
@@ -222,13 +232,13 @@ kill:
*/
if (th->syn && !th->rst && !th->ack && !paws_reject &&
- (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (after(TCP_SKB_CB(skb)->seq, rcv_nxt) ||
(tmp_opt.saw_tstamp &&
- (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) {
u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
if (isn == 0)
isn++;
- TCP_SKB_CB(skb)->tcp_tw_isn = isn;
+ *tw_isn = isn;
return TCP_TW_SYN;
}
@@ -338,17 +348,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN;
- /* tw_timer is pinned, so we need to make sure BH are disabled
- * in following section, otherwise timer handler could run before
- * we complete the initialization.
- */
- local_bh_disable();
- inet_twsk_schedule(tw, timeo);
/* Linkage updates.
* Note that access to tw after this point is illegal.
*/
- inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo);
- local_bh_enable();
+ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
@@ -388,7 +391,7 @@ void tcp_twsk_destructor(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
-void tcp_twsk_purge(struct list_head *net_exit_list, int family)
+void tcp_twsk_purge(struct list_head *net_exit_list)
{
bool purged_once = false;
struct net *net;
@@ -396,14 +399,13 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family)
list_for_each_entry(net, net_exit_list, exit_list) {
if (net->ipv4.tcp_death_row.hashinfo->pernet) {
/* Even if tw_refcount == 1, we must clean up kernel reqsk */
- inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family);
+ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo);
} else if (!purged_once) {
- inet_twsk_purge(&tcp_hashinfo, family);
+ inet_twsk_purge(&tcp_hashinfo);
purged_once = true;
}
}
}
-EXPORT_SYMBOL_GPL(tcp_twsk_purge);
/* Warning : This function is called without sk_listener being locked.
* Be sure to read socket fields once, as their value could change under us.
@@ -515,9 +517,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
const struct tcp_sock *oldtp;
struct tcp_sock *newtp;
u32 seq;
-#ifdef CONFIG_TCP_AO
- struct tcp_ao_key *ao_key;
-#endif
if (!newsk)
return NULL;
@@ -608,10 +607,14 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
#endif
#ifdef CONFIG_TCP_AO
newtp->ao_info = NULL;
- ao_key = treq->af_specific->ao_lookup(sk, req,
- tcp_rsk(req)->ao_keyid, -1);
- if (ao_key)
- newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
+
+ if (tcp_rsk_used_ao(req)) {
+ struct tcp_ao_key *ao_key;
+
+ ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1);
+ if (ao_key)
+ newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
+ }
#endif
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
@@ -625,6 +628,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+ xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
+
return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);
@@ -783,8 +788,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
- if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
+ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ tcp_rsk(req)->rcv_nxt,
+ tcp_rsk(req)->rcv_nxt +
+ tcp_synack_window(req))) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST) &&
!tcp_oow_rate_limited(sock_net(sk), skb,
@@ -879,7 +887,7 @@ embryonic_reset:
* avoid becoming vulnerable to outside attack aiming at
* resetting legit local connections.
*/
- req->rsk_ops->send_reset(sk, skb);
+ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
} else if (fastopen) { /* received a valid RST pkt */
reqsk_fastopen_remove(sk, req, true);
tcp_reset(sk, skb);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index ebe4722bb020..e4ad3311e148 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -28,6 +28,70 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
}
}
+static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct tcphdr *th;
+ struct iphdr *iph;
+
+ if (*oldip == newip && *oldport == newport)
+ return;
+
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
+ inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
+ *oldport = newport;
+
+ csum_replace4(&iph->check, *oldip, newip);
+ *oldip = newip;
+}
+
+static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct tcphdr *th;
+ const struct iphdr *iph;
+ struct sk_buff *seg;
+ struct tcphdr *th2;
+ struct iphdr *iph2;
+
+ seg = segs;
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+ th2 = tcp_hdr(seg->next);
+ iph2 = ip_hdr(seg->next);
+
+ if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
+ iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
+ return segs;
+
+ while ((seg = seg->next)) {
+ th2 = tcp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->saddr, iph->saddr,
+ &th2->source, th->source);
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->daddr, iph->daddr,
+ &th2->dest, th->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ return __tcpv4_gso_segment_list_csum(skb);
+}
+
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -37,6 +101,9 @@ static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
return ERR_PTR(-EINVAL);
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+ return __tcp4_gso_segment_list(skb, features);
+
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
@@ -73,6 +140,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (thlen < sizeof(*th))
goto out;
+ if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb)))
+ goto out;
+
if (!pskb_may_pull(skb, thlen))
goto out;
@@ -178,63 +248,76 @@ out:
return segs;
}
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
{
- struct sk_buff *pp = NULL;
+ struct tcphdr *th2;
struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ th2 = tcp_hdr(p);
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ return p;
+ }
+
+ return NULL;
+}
+
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
+{
+ unsigned int thlen, hlen, off;
struct tcphdr *th;
- struct tcphdr *th2;
- unsigned int len;
- unsigned int thlen;
- __be32 flags;
- unsigned int mss = 1;
- unsigned int hlen;
- unsigned int off;
- int flush = 1;
- int i;
off = skb_gro_offset(skb);
hlen = off + sizeof(*th);
th = skb_gro_header(skb, hlen, off);
if (unlikely(!th))
- goto out;
+ return NULL;
thlen = th->doff * 4;
if (thlen < sizeof(*th))
- goto out;
+ return NULL;
hlen = off + thlen;
if (!skb_gro_may_pull(skb, hlen)) {
th = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!th))
- goto out;
+ return NULL;
}
skb_gro_pull(skb, thlen);
- len = skb_gro_len(skb);
- flags = tcp_flag_word(th);
-
- list_for_each_entry(p, head, list) {
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
+ return th;
+}
- th2 = tcp_hdr(p);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ unsigned int thlen = th->doff * 4;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th2;
+ unsigned int len;
+ __be32 flags;
+ unsigned int mss = 1;
+ int flush = 1;
+ int i;
- if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
- goto found;
- }
- p = NULL;
- goto out_check_final;
+ p = tcp_gro_lookup(head, th);
+ if (!p)
+ goto out_check_final;
-found:
- /* Include the IP ID check below from the inner most IP hdr */
- flush = NAPI_GRO_CB(p)->flush;
- flush |= (__force int)(flags & TCP_FLAG_CWR);
+ th2 = tcp_hdr(p);
+ flush = (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
@@ -242,16 +325,7 @@ found:
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
- /* When we receive our second frame we can made a decision on if we
- * continue this flow as an atomic flow with a fixed ID or if we use
- * an incrementing ID.
- */
- if (NAPI_GRO_CB(p)->flush_id != 1 ||
- NAPI_GRO_CB(p)->count != 1 ||
- !NAPI_GRO_CB(p)->is_atomic)
- flush |= NAPI_GRO_CB(p)->flush_id;
- else
- NAPI_GRO_CB(p)->is_atomic = false;
+ flush |= gro_receive_network_flush(th, th2, p);
mss = skb_shinfo(p)->gso_size;
@@ -265,9 +339,19 @@ found:
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
-#ifdef CONFIG_TLS_DEVICE
- flush |= p->decrypted ^ skb->decrypted;
-#endif
+ flush |= skb_cmp_decrypted(p, skb);
+
+ if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
+ flush |= (__force int)(flags ^ tcp_flag_word(th2));
+ flush |= skb->ip_summed != p->ip_summed;
+ flush |= skb->csum_level != p->csum_level;
+ flush |= NAPI_GRO_CB(p)->count >= 64;
+
+ if (flush || skb_gro_receive_list(p, skb))
+ mss = 1;
+
+ goto out_check_final;
+ }
if (flush || skb_gro_receive(p, skb)) {
mss = 1;
@@ -290,7 +374,6 @@ out_check_final:
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
pp = p;
-out:
NAPI_GRO_CB(skb)->flush |= (flush != 0);
return pp;
@@ -316,30 +399,80 @@ void tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);
+static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ const struct iphdr *iph;
+ struct sk_buff *p;
+ struct sock *sk;
+ struct net *net;
+ int iif, sdif;
+
+ if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+ iph = skb_gro_network_header(skb);
+ net = dev_net(skb->dev);
+ sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
+ iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_put(sk);
+}
+
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
+ struct tcphdr *th;
+
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
- inet_gro_compute_pseudo)) {
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
+ inet_gro_compute_pseudo))
+ goto flush;
+
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
- return tcp_gro_receive(head, skb);
+ tcp4_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
}
INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
- const struct iphdr *iph = ip_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
struct tcphdr *th = tcp_hdr(skb);
+ if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
- (NAPI_GRO_CB(skb)->is_atomic * SKB_GSO_TCP_FIXEDID);
+ (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
tcp_gro_complete(skb);
return 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad96567..4fd746bd4d54 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -39,11 +39,13 @@
#include <net/tcp.h>
#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>
+#include <linux/skbuff_ref.h>
#include <trace/events/tcp.h>
@@ -203,16 +205,17 @@ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
* This MUST be enforced by all callers.
*/
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
- __u32 *rcv_wnd, __u32 *window_clamp,
+ __u32 *rcv_wnd, __u32 *__window_clamp,
int wscale_ok, __u8 *rcv_wscale,
__u32 init_rcv_wnd)
{
unsigned int space = (__space < 0 ? 0 : __space);
+ u32 window_clamp = READ_ONCE(*__window_clamp);
/* If no clamp set the clamp to the max possible scaled window */
- if (*window_clamp == 0)
- (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
- space = min(*window_clamp, space);
+ if (window_clamp == 0)
+ window_clamp = (U16_MAX << TCP_MAX_WSCALE);
+ space = min(window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
@@ -229,7 +232,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
- (*rcv_wnd) = min_t(u32, space, U16_MAX);
+ (*rcv_wnd) = space;
if (init_rcv_wnd)
*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
@@ -239,12 +242,13 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
/* Set window scaling on max possible window */
space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
- space = min_t(u32, space, *window_clamp);
+ space = min_t(u32, space, window_clamp);
*rcv_wscale = clamp_t(int, ilog2(space) - 15,
0, TCP_MAX_WSCALE);
}
/* Set the clamp no higher than max representable value */
- (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
+ WRITE_ONCE(*__window_clamp,
+ min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
EXPORT_SYMBOL(tcp_select_initial_window);
@@ -1297,7 +1301,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
if (clone_it) {
oskb = skb;
@@ -1499,18 +1503,22 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
}
/* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
+ int tso_segs;
+
if (skb->len <= mss_now) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- tcp_skb_pcount_set(skb, 1);
TCP_SKB_CB(skb)->tcp_gso_size = 0;
- } else {
- tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
- TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+ tcp_skb_pcount_set(skb, 1);
+ return 1;
}
+ TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+ tso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ tcp_skb_pcount_set(skb, tso_segs);
+ return tso_segs;
}
/* Pcount in the middle of the write queue got changed, we need to do various
@@ -1647,7 +1655,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
skb_split(skb, buff, len);
- skb_set_delivery_time(buff, skb->tstamp, true);
+ skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC);
tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
@@ -2070,16 +2078,10 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
- const struct sk_buff *skb)
+static u32 tcp_cwnd_test(const struct tcp_sock *tp)
{
u32 in_flight, cwnd, halfcwnd;
- /* Don't be strict about the congestion window for the final FIN. */
- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
- tcp_skb_pcount(skb) == 1)
- return 1;
-
in_flight = tcp_packets_in_flight(tp);
cwnd = tcp_snd_cwnd(tp);
if (in_flight >= cwnd)
@@ -2100,10 +2102,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
- if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
- tcp_set_skb_tso_segs(skb, mss_now);
- tso_segs = tcp_skb_pcount(skb);
- }
+ if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
+ return tcp_set_skb_tso_segs(skb, mss_now);
+
return tso_segs;
}
@@ -2343,7 +2344,8 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
if (unlikely(TCP_SKB_CB(skb)->eor) ||
tcp_has_tx_tstamp(skb) ||
- !skb_pure_zcopy_same(skb, next))
+ !skb_pure_zcopy_same(skb, next) ||
+ skb_frags_readable(skb) != skb_frags_readable(next))
return false;
len -= skb->len;
@@ -2403,6 +2405,21 @@ commit:
return 0;
}
+/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if
+ * all its payload was moved to another one (dst).
+ * Make sure to transfer tcp_flags, eor, and tstamp.
+ */
+static void tcp_eat_one_skb(struct sock *sk,
+ struct sk_buff *dst,
+ struct sk_buff *src)
+{
+ TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags;
+ TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor;
+ tcp_skb_collapse_tstamp(dst, src);
+ tcp_unlink_write_queue(src, sk);
+ tcp_wmem_free_skb(sk, src);
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -2508,16 +2525,7 @@ static int tcp_mtu_probe(struct sock *sk)
copy = min_t(int, skb->len, probe_size - len);
if (skb->len <= copy) {
- /* We've eaten all the data from this skb.
- * Throw it away. */
- TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
- /* If this is the last SKB we copy and eor is set
- * we need to propagate it to the new skb.
- */
- TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
- tcp_skb_collapse_tstamp(nskb, skb);
- tcp_unlink_write_queue(skb, sk);
- tcp_wmem_free_skb(sk, skb);
+ tcp_eat_one_skb(sk, nskb, skb);
} else {
TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
~(TCPHDR_FIN|TCPHDR_PSH);
@@ -2683,6 +2691,35 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
tcp_chrono_set(tp, TCP_CHRONO_BUSY);
}
+/* First skb in the write queue is smaller than ideal packet size.
+ * Check if we can move payload from the second skb in the queue.
+ */
+static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
+{
+ struct sk_buff *next_skb = skb->next;
+ unsigned int nlen;
+
+ if (tcp_skb_is_last(sk, skb))
+ return;
+
+ if (!tcp_skb_can_collapse(skb, next_skb))
+ return;
+
+ nlen = min_t(u32, amount, next_skb->len);
+ if (!nlen || !skb_shift(skb, next_skb, nlen))
+ return;
+
+ TCP_SKB_CB(skb)->end_seq += nlen;
+ TCP_SKB_CB(next_skb)->seq += nlen;
+
+ if (!next_skb->len) {
+ /* In case FIN is set, we need to update end_seq */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ tcp_eat_one_skb(sk, skb, next_skb);
+ }
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2703,10 +2740,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
- int cwnd_quota;
+ u32 cwnd_quota, max_segs;
int result;
bool is_cwnd_limited = false, is_rwnd_limited = false;
- u32 max_segs;
sent_pkts = 0;
@@ -2724,11 +2760,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ int missing_bytes;
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
tp->tcp_wstamp_ns = tp->tcp_clock_cache;
- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
@@ -2737,10 +2774,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (tcp_pacing_check(sk))
break;
- tso_segs = tcp_init_tso_segs(skb, mss_now);
- BUG_ON(!tso_segs);
-
- cwnd_quota = tcp_cwnd_test(tp, skb);
+ cwnd_quota = tcp_cwnd_test(tp);
if (!cwnd_quota) {
if (push_one == 2)
/* Force out a loss probe pkt. */
@@ -2748,6 +2782,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
else
break;
}
+ cwnd_quota = min(cwnd_quota, max_segs);
+ missing_bytes = cwnd_quota * mss_now - skb->len;
+ if (missing_bytes > 0)
+ tcp_grow_skb(sk, skb, missing_bytes);
+
+ tso_segs = tcp_set_skb_tso_segs(skb, mss_now);
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true;
@@ -2769,9 +2809,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
- min_t(unsigned int,
- cwnd_quota,
- max_segs),
+ cwnd_quota,
nonagle);
if (skb->len > limit &&
@@ -3227,6 +3265,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
return false;
if (skb_cloned(skb))
return false;
+ if (!skb_frags_readable(skb))
+ return false;
/* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
@@ -3387,11 +3427,6 @@ start:
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
- /* To avoid taking spuriously low RTT samples based on a timestamp
- * for a transmit that never happened, always mark EVER_RETRANS
- */
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
-
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
@@ -3401,6 +3436,12 @@ start:
} else if (err != -EBUSY) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
}
+
+ /* To avoid taking spuriously low RTT samples based on a timestamp
+ * for a transmit that never happened, always mark EVER_RETRANS
+ */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
return err;
}
@@ -3563,7 +3604,9 @@ void tcp_send_fin(struct sock *sk)
return;
}
} else {
- skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ skb = alloc_skb_fclone(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC |
+ __GFP_NOWARN));
if (unlikely(!skb))
return;
@@ -3583,7 +3626,8 @@ void tcp_send_fin(struct sock *sk)
* was unread data in the receive queue. This behavior is recommended
* by RFC 2525, section 2.17. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+ enum sk_rst_reason reason)
{
struct sk_buff *skb;
@@ -3608,7 +3652,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
* skb here is different to the troublesome skb, so use NULL
*/
- trace_tcp_send_reset(sk, NULL);
+ trace_tcp_send_reset(sk, NULL, reason);
}
/* Send a crossed SYN-ACK during socket establishment.
@@ -3711,11 +3755,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
#ifdef CONFIG_SYN_COOKIES
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
- true);
+ SKB_CLOCK_MONOTONIC);
else
#endif
{
- skb_set_delivery_time(skb, now, true);
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
}
@@ -3727,6 +3771,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
#ifdef CONFIG_TCP_AO
struct tcp_ao_key *ao_key = NULL;
u8 keyid = tcp_rsk(req)->ao_keyid;
+ u8 rnext = tcp_rsk(req)->ao_rcv_next;
ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
keyid, -1);
@@ -3736,6 +3781,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
* ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
*/
if (unlikely(!ao_key)) {
+ trace_tcp_ao_synack_no_key(sk, keyid, rnext);
rcu_read_unlock();
kfree_skb(skb);
net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n",
@@ -3802,7 +3848,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
synack_type, &opts);
- skb_set_delivery_time(skb, now, true);
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
tcp_add_tx_delay(skb, tp);
return skb;
@@ -3855,7 +3901,7 @@ static void tcp_connect_init(struct sock *sk)
tcp_ca_dst_init(sk, dst);
if (!tp->window_clamp)
- tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW));
tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
@@ -3863,7 +3909,7 @@ static void tcp_connect_init(struct sock *sk)
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
- tp->window_clamp = tcp_full_space(sk);
+ WRITE_ONCE(tp->window_clamp, tcp_full_space(sk));
rcv_wnd = tcp_rwnd_init_bpf(sk);
if (rcv_wnd == 0)
@@ -3986,7 +4032,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true);
+ skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC);
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
@@ -4122,16 +4168,9 @@ EXPORT_SYMBOL(tcp_connect);
u32 tcp_delack_max(const struct sock *sk)
{
- const struct dst_entry *dst = __sk_dst_get(sk);
- u32 delack_max = inet_csk(sk)->icsk_delack_max;
+ u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1;
- if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) {
- u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
- u32 delack_from_rto_min = max_t(int, 1, rto_min - 1);
-
- delack_max = min_t(u32, delack_max, delack_from_rto_min);
- }
- return delack_max;
+ return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min);
}
/* Send out a delayed ack, the caller does the policy checking
diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c
index 8512cb09ebc0..d8a4f192873a 100644
--- a/net/ipv4/tcp_sigpool.c
+++ b/net/ipv4/tcp_sigpool.c
@@ -10,7 +10,14 @@
#include <net/tcp.h>
static size_t __scratch_size;
-static DEFINE_PER_CPU(void __rcu *, sigpool_scratch);
+struct sigpool_scratch {
+ local_lock_t bh_lock;
+ void __rcu *pad;
+};
+
+static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
struct sigpool_entry {
struct crypto_ahash *hash;
@@ -72,7 +79,7 @@ static int sigpool_reserve_scratch(size_t size)
break;
}
- old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
+ old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
scratch, lockdep_is_held(&cpool_mutex));
if (!cpu_online(cpu) || !old_scratch) {
kfree(old_scratch);
@@ -93,7 +100,7 @@ static void sigpool_scratch_free(void)
int cpu;
for_each_possible_cpu(cpu)
- kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu),
+ kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
NULL, lockdep_is_held(&cpool_mutex)));
__scratch_size = 0;
}
@@ -277,7 +284,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RC
/* Pairs with tcp_sigpool_reserve_scratch(), scratch area is
* valid (allocated) until tcp_sigpool_end().
*/
- c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch));
+ local_lock_nested_bh(&sigpool_scratch.bh_lock);
+ c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad));
return 0;
}
EXPORT_SYMBOL_GPL(tcp_sigpool_start);
@@ -286,6 +294,7 @@ void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH)
{
struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req);
+ local_unlock_nested_bh(&sigpool_scratch.bh_lock);
rcu_read_unlock_bh();
ahash_request_free(c->req);
crypto_free_ahash(hash);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d1ad20ce1c8c..79064580c8c0 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,10 +22,11 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
+#include <net/rstreason.h>
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed, user_timeout;
s32 remaining;
@@ -47,7 +48,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
u32 remaining, user_timeout;
s32 elapsed;
@@ -73,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
static void tcp_write_err(struct sock *sk)
{
- WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
- sk_error_report(sk);
-
- tcp_write_queue_purge(sk);
- tcp_done(sk);
+ tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}
@@ -127,7 +124,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
(!tp->snd_wnd && !tp->packets_out))
do_reset = true;
if (do_reset)
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_MEMORY);
tcp_done(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
@@ -284,6 +282,7 @@ static int tcp_write_timeout(struct sock *sk)
expired = retransmits_timed_out(sk, retry_until,
READ_ONCE(icsk->icsk_user_timeout));
tcp_fastopen_active_detect_blackhole(sk, expired);
+ mptcp_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
@@ -481,11 +480,26 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
const struct sk_buff *skb,
u32 rtx_delta)
{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
const struct tcp_sock *tp = tcp_sk(sk);
- const int timeout = TCP_RTO_MAX * 2;
- u32 rcv_delta;
+ int timeout = TCP_RTO_MAX * 2;
+ s32 rcv_delta;
- rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
+ if (user_timeout) {
+ /* If user application specified a TCP_USER_TIMEOUT,
+ * it does not want win 0 packets to 'reset the timer'
+ * while retransmits are not making progress.
+ */
+ if (rtx_delta > user_timeout)
+ return true;
+ timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout));
+ }
+ /* Note: timer interrupt might have been delayed by at least one jiffy,
+ * and tp->rcv_tstamp might very well have been written recently.
+ * rcv_delta can thus be negative.
+ */
+ rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp;
if (rcv_delta <= timeout)
return false;
@@ -530,8 +544,6 @@ void tcp_retransmit_timer(struct sock *sk)
if (WARN_ON_ONCE(!skb))
return;
- tp->tlp_high_seq = 0;
-
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
/* Receiver dastardly shrinks window. Our retransmits
@@ -768,7 +780,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
goto out;
}
}
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE);
goto death;
}
@@ -795,7 +807,8 @@ static void tcp_keepalive_timer (struct timer_list *t)
icsk->icsk_probes_out > 0) ||
(user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT);
tcp_write_err(sk);
goto out;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c02bf011d4a6..8accbf4cb295 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -115,6 +115,7 @@
#include <net/addrconf.h>
#include <net/udp_tunnel.h>
#include <net/gro.h>
+#include <net/inet_dscp.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif
@@ -326,6 +327,8 @@ found:
goto fail_unlock;
}
+ sock_set_flag(sk, SOCK_RCU_FREE);
+
sk_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -342,7 +345,7 @@ found:
hslot2->count++;
spin_unlock(&hslot2->lock);
}
- sock_set_flag(sk, SOCK_RCU_FREE);
+
error = 0;
fail_unlock:
spin_unlock_bh(&hslot->lock);
@@ -363,7 +366,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
-static int compute_score(struct sock *sk, struct net *net,
+static int compute_score(struct sock *sk, const struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned short hnum,
int dif, int sdif)
@@ -418,7 +421,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
}
/* called with rcu_read_lock() */
-static struct sock *udp4_lib_lookup2(struct net *net,
+static struct sock *udp4_lib_lookup2(const struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum,
int dif, int sdif,
@@ -427,15 +430,21 @@ static struct sock *udp4_lib_lookup2(struct net *net,
{
struct sock *sk, *result;
int score, badness;
+ bool need_rescore;
result = NULL;
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif);
+ need_rescore = false;
+rescore:
+ score = compute_score(need_rescore ? result : sk, net, saddr,
+ sport, daddr, hnum, dif, sdif);
if (score > badness) {
badness = score;
+ if (need_rescore)
+ continue;
+
if (sk->sk_state == TCP_ESTABLISHED) {
result = sk;
continue;
@@ -456,9 +465,14 @@ static struct sock *udp4_lib_lookup2(struct net *net,
if (IS_ERR(result))
continue;
- badness = compute_score(result, net, saddr, sport,
- daddr, hnum, dif, sdif);
-
+ /* compute_score is too long of a function to be
+ * inlined, and calling it again here yields
+ * measureable overhead for some
+ * workloads. Work around it by jumping
+ * backwards to rescore 'result'.
+ */
+ need_rescore = true;
+ goto rescore;
}
}
return result;
@@ -467,7 +481,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
* harder than this. -DaveM
*/
-struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport, int dif,
int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
@@ -532,7 +546,8 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
- const struct iphdr *iph = ip_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
struct net *net = dev_net(skb->dev);
int iif, sdif;
@@ -547,7 +562,7 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
-struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
struct sock *sk;
@@ -926,8 +941,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
kfree_skb(skb);
return -EINVAL;
}
- if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
- dst_xfrm(skb_dst(skb))) {
+ if (is_udplite || dst_xfrm(skb_dst(skb))) {
kfree_skb(skb);
return -EIO;
}
@@ -1123,16 +1137,17 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = udp_cmsg_send(sk, msg, &ipc.gso_size);
- if (err > 0)
+ if (err > 0) {
err = ip_cmsg_send(sk, msg, &ipc,
sk->sk_family == AF_INET6);
+ connected = 0;
+ }
if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
if (ipc.opt)
free = 1;
- connected = 0;
}
if (!ipc.opt) {
struct ip_options_rcu *inet_opt;
@@ -1205,7 +1220,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if (connected)
- rt = (struct rtable *)sk_dst_check(sk, 0);
+ rt = dst_rtable(sk_dst_check(sk, 0));
if (!rt) {
struct net *net = sock_net(sk);
@@ -1499,13 +1514,15 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
struct sk_buff_head *list = &sk->sk_receive_queue;
int rmem, err = -ENOMEM;
spinlock_t *busy = NULL;
- int size;
+ bool becomes_readable;
+ int size, rcvbuf;
- /* try to avoid the costly atomic add/sub pair when the receive
- * queue is full; always allow at least a packet
+ /* Immediately drop when the receive queue is full.
+ * Always allow at least one packet.
*/
rmem = atomic_read(&sk->sk_rmem_alloc);
- if (rmem > sk->sk_rcvbuf)
+ rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+ if (rmem > rcvbuf)
goto drop;
/* Under mem pressure, it might be helpful to help udp_recvmsg()
@@ -1514,7 +1531,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
* - Less cache line misses at copyout() time
* - Less work at consume_skb() (less alien page frag freeing)
*/
- if (rmem > (sk->sk_rcvbuf >> 1)) {
+ if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
busy = busylock_acquire(sk);
@@ -1522,12 +1539,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
size = skb->truesize;
udp_set_dev_scratch(skb);
- /* we drop only if the receive buf is full and the receive
- * queue contains some other skb
- */
- rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
- if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
- goto uncharge_drop;
+ atomic_add(size, &sk->sk_rmem_alloc);
spin_lock(&list->lock);
err = udp_rmem_schedule(sk, size);
@@ -1543,12 +1555,19 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
*/
sock_skb_set_dropcount(sk, skb);
+ becomes_readable = skb_queue_empty(list);
__skb_queue_tail(list, skb);
spin_unlock(&list->lock);
- if (!sock_flag(sk, SOCK_DEAD))
- INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
-
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ if (becomes_readable ||
+ sk->sk_data_ready != sock_def_readable ||
+ READ_ONCE(sk->sk_peek_off) >= 0)
+ INDIRECT_CALL_1(sk->sk_data_ready,
+ sock_def_readable, sk);
+ else
+ sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
+ }
busylock_release(busy);
return 0;
@@ -2056,8 +2075,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
drop_reason = SKB_DROP_REASON_PROTO_MEM;
}
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb_reason(skb, drop_reason);
- trace_udp_fail_queue_rcv_skb(rc, sk);
+ trace_udp_fail_queue_rcv_skb(rc, sk, skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
@@ -2179,7 +2198,7 @@ csum_error:
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
- kfree_skb_reason(skb, drop_reason);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
@@ -2213,7 +2232,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old;
if (dst_hold_safe(dst)) {
- old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst);
+ old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst)));
dst_release(old);
return old != dst;
}
@@ -2366,7 +2385,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
- struct sock *sk;
+ struct sock *sk = NULL;
struct udphdr *uh;
unsigned short ulen;
struct rtable *rt = skb_rtable(skb);
@@ -2443,7 +2462,7 @@ no_sk:
* Hmm. We got an UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
- kfree_skb_reason(skb, drop_reason);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
short_packet:
@@ -2468,7 +2487,7 @@ csum_error:
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
- kfree_skb_reason(skb, drop_reason);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
}
@@ -2600,7 +2619,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
if (!inet_sk(sk)->inet_daddr && in_dev)
return ip_mc_validate_source(skb, iph->daddr,
iph->saddr,
- iph->tos & IPTOS_RT_MASK,
+ iph->tos & INET_DSCP_MASK,
skb->dev, in_dev, &itag);
}
return 0;
@@ -2695,8 +2714,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
#ifdef CONFIG_XFRM
case UDP_ENCAP_ESPINUDP:
set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk);
- fallthrough;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
WRITE_ONCE(up->encap_rcv,
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 3498dd1d0694..d842303587af 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -278,6 +278,17 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
if (gso_skb->len <= sizeof(*uh) + mss)
return ERR_PTR(-EINVAL);
+ if (unlikely(skb_checksum_start(gso_skb) !=
+ skb_transport_header(gso_skb) &&
+ !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)))
+ return ERR_PTR(-EINVAL);
+
+ /* We don't know if egress device can segment and checksum the packet
+ * when IPv6 extension headers are present. Fall back to software GSO.
+ */
+ if (gso_skb->ip_summed != CHECKSUM_PARTIAL)
+ features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK);
+
if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
@@ -357,6 +368,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
else
uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+ /* On the TX path, CHECKSUM_NONE and CHECKSUM_UNNECESSARY have the same
+ * meaning. However, check for bad offloads in the GSO stack expects the
+ * latter, if the checksum was calculated in software. To vouch for the
+ * segment skbs we actually need to set it on the gso_skb.
+ */
+ if (gso_skb->ip_summed == CHECKSUM_NONE)
+ gso_skb->ip_summed = CHECKSUM_UNNECESSARY;
+
/* update refcount for the packet */
if (copy_dtor) {
int delta = sum_truesize - gso_skb->truesize;
@@ -433,33 +452,6 @@ out:
return segs;
}
-static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
- if (unlikely(p->len + skb->len >= 65536))
- return -E2BIG;
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
-
- skb_pull(skb, skb_gro_offset(skb));
-
- NAPI_GRO_CB(p)->last = skb;
- NAPI_GRO_CB(p)->count++;
- p->data_len += skb->len;
-
- /* sk ownership - if any - completely transferred to the aggregated packet */
- skb->destructor = NULL;
- skb->sk = NULL;
- p->truesize += skb->truesize;
- p->len += skb->len;
-
- NAPI_GRO_CB(skb)->same_flow = 1;
-
- return 0;
-}
-
#define UDP_GRO_CNT_MAX 64
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
@@ -471,6 +463,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
struct sk_buff *p;
unsigned int ulen;
int ret = 0;
+ int flush;
/* requires non zero csum, for symmetry with GSO */
if (!uh->check) {
@@ -504,13 +497,15 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
return p;
}
+ flush = gro_receive_network_flush(uh, uh2, p);
+
/* Terminate the flow on len mismatch or if it grow "too much".
* Under small packet flood GRO count could elsewhere grow a lot
* leading to excessive truesize values.
* On len mismatch merge the first packet shorter than gso_size,
* otherwise complete the GRO packet.
*/
- if (ulen > ntohs(uh2->len)) {
+ if (ulen > ntohs(uh2->len) || flush) {
pp = p;
} else {
if (NAPI_GRO_CB(skb)->is_flist) {
@@ -718,7 +713,8 @@ EXPORT_SYMBOL(udp_gro_complete);
INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
- const struct iphdr *iph = ip_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
/* do fraglist only if there is no outer UDP encap (or we already processed it) */
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 860aff5f8599..619a53eb672d 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -6,6 +6,7 @@
#include <net/dst_metadata.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
+#include <net/inet_dscp.h>
int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
@@ -183,7 +184,8 @@ void udp_tunnel_sock_release(struct socket *sock)
EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
- __be16 flags, __be64 tunnel_id, int md_size)
+ const unsigned long *flags,
+ __be64 tunnel_id, int md_size)
{
struct metadata_dst *tun_dst;
struct ip_tunnel_info *info;
@@ -199,7 +201,7 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
info->key.tp_src = udp_hdr(skb)->source;
info->key.tp_dst = udp_hdr(skb)->dest;
if (udp_hdr(skb)->check)
- info->key.tun_flags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
return tun_dst;
}
EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
@@ -231,7 +233,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
fl4.saddr = key->u.ipv4.src;
fl4.fl4_dport = dport;
fl4.fl4_sport = sport;
- fl4.flowi4_tos = RT_TOS(tos);
+ fl4.flowi4_tos = tos & INET_DSCP_MASK;
fl4.flowi4_flags = key->flow_flags;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index dae35101d189..a620618cc568 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -63,7 +63,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
ip_send_check(iph);
if (xo && (xo->flags & XFRM_GRO)) {
- skb_mac_header_rebuild(skb);
+ /* The full l2 header needs to be preserved so that re-injecting the packet at l2
+ * works correctly in the presence of vlan tags.
+ */
+ skb_mac_header_rebuild_full(skb, xo->orig_mac_len);
+ skb_reset_network_header(skb);
skb_reset_transport_header(skb);
return 0;
}
@@ -113,19 +117,6 @@ static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull
/* Must be an IKE packet.. pass it through */
return 1;
break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- /* Check if this is a keepalive packet. If so, eat it. */
- if (len == 1 && udpdata[0] == 0xff) {
- return -EINVAL;
- } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
- udpdata32[0] == 0 && udpdata32[1] == 0) {
-
- /* ESP Packet with Non-IKE marker */
- len = sizeof(struct udphdr) + 2 * sizeof(u32);
- } else
- /* Must be an IKE packet.. pass it through */
- return 1;
- break;
}
/* At this point we are sure that this is an ESPinUDP packet,
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index c33bca2c3841..0294fef577fa 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -69,7 +69,7 @@ static int xfrm4_get_saddr(struct net *net, int oif,
static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
const struct flowi *fl)
{
- struct rtable *rt = (struct rtable *)xdst->route;
+ struct rtable *rt = dst_rtable(xdst->route);
const struct flowi4 *fl4 = &fl->u.ip4;
xdst->u.rt.rt_iif = fl4->flowi4_iif;
@@ -152,7 +152,6 @@ static struct ctl_table xfrm4_policy_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
static __net_init int xfrm4_net_sysctl_init(struct net *net)
@@ -186,7 +185,7 @@ err_alloc:
static __net_exit void xfrm4_net_sysctl_exit(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
if (!net->ipv4.xfrm4_hdr)
return;