summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorStefan Schmidt <stefan@datenfreihafen.org>2018-08-06 09:04:48 +0200
committerStefan Schmidt <stefan@datenfreihafen.org>2018-08-06 09:04:48 +0200
commita30461080366214b690a367225a48c95d7a6a189 (patch)
tree02cd7cfb8cea14cebe1ab2a1638edd8c9b9e0d3f /net/ipv4
parent811e299f4645588cc7a1b78d97b6847c155324b9 (diff)
parent981467033a37d916649647fa3afe1fe99bba1817 (diff)
downloadlinux-stable-a30461080366214b690a367225a48c95d7a6a189.tar.gz
linux-stable-a30461080366214b690a367225a48c95d7a6a189.tar.bz2
linux-stable-a30461080366214b690a367225a48c95d7a6a189.zip
Merge remote-tracking branch 'net-next/master'
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig4
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/bpfilter/Makefile1
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/esp4_offload.c6
-rw-r--r--net/ipv4/fib_frontend.c5
-rw-r--r--net/ipv4/igmp.c54
-rw-r--r--net/ipv4/inet_fragment.c24
-rw-r--r--net/ipv4/ip_forward.c3
-rw-r--r--net/ipv4/ip_fragment.c234
-rw-r--r--net/ipv4/ip_gre.c5
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ip_sockglue.c11
-rw-r--r--net/ipv4/ipmr.c21
-rw-r--r--net/ipv4/netfilter.c53
-rw-r--r--net/ipv4/netfilter/Kconfig22
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c472
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c383
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c18
-rw-r--r--net/ipv4/ping.c6
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/route.c6
-rw-r--r--net/ipv4/sysctl_net_ipv4.c31
-rw-r--r--net/ipv4/tcp.c71
-rw-r--r--net/ipv4/tcp_bbr.c4
-rw-r--r--net/ipv4/tcp_dctcp.c75
-rw-r--r--net/ipv4/tcp_input.c91
-rw-r--r--net/ipv4/tcp_ipv4.c23
-rw-r--r--net/ipv4/tcp_offload.c3
-rw-r--r--net/ipv4/tcp_output.c41
-rw-r--r--net/ipv4/tcp_recovery.c2
-rw-r--r--net/ipv4/tcp_timer.c51
35 files changed, 506 insertions, 1242 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 80dad301361d..32cae39cdff6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -430,7 +430,7 @@ config INET_DIAG
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at:
-
+
http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
If unsure, say Y.
@@ -600,7 +600,7 @@ config TCP_CONG_VENO
distinguishing to circumvent the difficult judgment of the packet loss
type. TCP Veno cuts down less congestion window in response to random
loss packets.
- See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index eec9569ffa5c..7446b98661d8 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_INET_DIAG) += inet_diag.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c716be13d58c..20fda8fb8ffd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog)
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
sk->sk_max_ack_backlog = backlog;
err = 0;
@@ -485,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!net->ipv4.sysctl_ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
+ if (!inet_can_nonlocal_bind(net, inet) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
@@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net)
* We set them here, in case sysctl is not compiled.
*/
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_fwd_update_priority = 1;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
net->ipv4.sysctl_udp_early_demux = 1;
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
index ce262d76cc48..e9e42f99725e 100644
--- a/net/ipv4/bpfilter/Makefile
+++ b/net/ipv4/bpfilter/Makefile
@@ -1,2 +1 @@
obj-$(CONFIG_BPFILTER) += sockopt.o
-
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..ea4bd8a52422 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+ if (all || type == NETCONFA_BC_FORWARDING)
+ size += nla_total_size(4);
if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+ if ((all || type == NETCONFA_BC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+ IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+ goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
+ if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+ new_value != old_value)
+ rt_cache_flush(net);
+
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+ DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index bbeecd13e534..58834a10c0be 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev))
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
if (!xo)
return -EINVAL;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev)) {
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b21833651394..2998b0e47d4b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -292,18 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
return ip_hdr(skb)->daddr;
in_dev = __in_dev_get_rcu(dev);
- BUG_ON(!in_dev);
net = dev_net(dev);
scope = RT_SCOPE_UNIVERSE;
if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
struct flowi4 fl4 = {
.flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
.flowi4_scope = scope,
- .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
+ .flowi4_mark = vmark ? skb->mark : 0,
};
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 85b617b655bc..cf75f8944b05 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,13 +1200,13 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
spin_lock_bh(&im->lock);
if (pmc) {
im->interface = pmc->interface;
- im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
- im->sfmode = pmc->sfmode;
- if (pmc->sfmode == MCAST_INCLUDE) {
+ if (im->sfmode == MCAST_INCLUDE) {
im->tomb = pmc->tomb;
im->sources = pmc->sources;
for (psf = im->sources; psf; psf = psf->sf_next)
- psf->sf_crcount = im->crcount;
+ psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ } else {
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
}
in_dev_put(pmc->interface);
kfree(pmc);
@@ -1316,7 +1316,13 @@ static void igmp_group_added(struct ip_mc_list *im)
}
/* else, v3 */
- im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should
+ * not send filter-mode change record as the mode should be from
+ * IN() to IN(A).
+ */
+ if (im->sfmode == MCAST_EXCLUDE)
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+
igmp_ifc_event(in_dev);
#endif
}
@@ -1381,8 +1387,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
/*
* A socket has joined a multicast group on device dev.
*/
-
-void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
+ unsigned int mode)
{
struct ip_mc_list *im;
#ifdef CONFIG_IP_MULTICAST
@@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == addr) {
im->users++;
- ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+ ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
goto out;
}
}
@@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
in_dev_hold(in_dev);
im->multiaddr = addr;
/* initial mode is (EX, empty) */
- im->sfmode = MCAST_EXCLUDE;
- im->sfcount[MCAST_EXCLUDE] = 1;
+ im->sfmode = mode;
+ im->sfcount[mode] = 1;
refcount_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
@@ -1432,6 +1438,11 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
out:
return;
}
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+ __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+}
EXPORT_SYMBOL(ip_mc_inc_group);
static int ip_mc_check_iphdr(struct sk_buff *skb)
@@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
/* Join a multicast group
*/
-
-int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
+ unsigned int mode)
{
__be32 addr = imr->imr_multiaddr.s_addr;
struct ip_mc_socklist *iml, *i;
@@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next_rcu = inet->mc_list;
iml->sflist = NULL;
- iml->sfmode = MCAST_EXCLUDE;
+ iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
- ip_mc_inc_group(in_dev, addr);
+ __ip_mc_inc_group(in_dev, addr, mode);
err = 0;
done:
return err;
}
+
+/* Join ASM (Any-Source Multicast) group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
+}
EXPORT_SYMBOL(ip_mc_join_group);
+/* Join SSM (Source-Specific Multicast) group
+ */
+int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+ unsigned int mode)
+{
+ return __ip_mc_join_group(sk, imr, mode);
+}
+
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 316518f87294..6d258a5669e7 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -91,7 +91,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
void inet_frags_exit_net(struct netns_frags *nf)
{
- nf->low_thresh = 0; /* prevent creation of new frags */
+ nf->high_thresh = 0; /* prevent creation of new frags */
rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
@@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = q->fragments;
nf = q->net;
f = nf->f;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
+ if (fp) {
+ do {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
+ fp = xp;
+ } while (fp);
+ } else {
+ sum_truesize = skb_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
@@ -158,9 +162,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
{
struct inet_frag_queue *q;
- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
- return NULL;
-
q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (!q)
return NULL;
@@ -205,6 +206,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
struct inet_frag_queue *fq;
+ if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+ return NULL;
+
rcu_read_lock();
fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index b54b948b0596..32662e9e5d21 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb)
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
- skb->priority = rt_tos2priority(iph->tos);
+ if (net->ipv4.sysctl_ip_fwd_update_priority)
+ skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8e9528ebaa8e..0e8f8de77e71 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
const struct iphdr *iph;
- struct sk_buff *head;
+ struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
- head = qp->q.fragments;
-
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+ if (!qp->q.flags & INET_FRAG_FIRST_IN)
goto out;
+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+ * pull the head out of the tree in order to be able to
+ * deal with head->dev.
+ */
+ if (qp->q.fragments) {
+ head = qp->q.fragments;
+ qp->q.fragments = head->next;
+ } else {
+ head = skb_rb_first(&qp->q.rb_fragments);
+ if (!head)
+ goto out;
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+ }
+ if (head == qp->q.fragments_tail)
+ qp->q.fragments_tail = NULL;
+
+ sub_frag_mem_limit(qp->q.net, head->truesize);
+
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
- skb_get(head);
spin_unlock(&qp->q.lock);
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- kfree_skb(head);
goto out_rcu_unlock;
out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
+ if (head)
+ kfree_skb(head);
ipq_put(qp);
}
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid);
qp->rid = end;
- rc = qp->q.fragments && (end - start) > max;
+ rc = qp->q.fragments_tail && (end - start) > max;
if (rc) {
struct net *net;
@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
- struct sk_buff *fp;
unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT;
}
- fp = qp->q.fragments;
- do {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
- } while (fp);
+ sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->iif = 0;
qp->ecn = 0;
@@ -277,7 +287,9 @@ static int ip_frag_reinit(struct ipq *qp)
/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
- struct sk_buff *prev, *next;
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct rb_node **rbn, *parent;
+ struct sk_buff *skb1;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
@@ -340,95 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (err)
goto err;
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = qp->q.fragments_tail;
- if (!prev || prev->ip_defrag_offset < offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = qp->q.fragments; next != NULL; next = next->next) {
- if (next->ip_defrag_offset >= offset)
- break; /* bingo! */
- prev = next;
- }
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
-found:
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+ * We do the same here for IPv4 (and increment an snmp counter).
*/
- if (prev) {
- int i = (prev->ip_defrag_offset + prev->len) - offset;
- if (i > 0) {
- offset += i;
- err = -EINVAL;
- if (end <= offset)
- goto err;
- err = -ENOMEM;
- if (!pskb_pull(skb, i))
- goto err;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
-
- err = -ENOMEM;
-
- while (next && next->ip_defrag_offset < end) {
- int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
-
- if (i < next->len) {
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- if (!pskb_pull(next, i))
- goto err;
- next->ip_defrag_offset += i;
- qp->q.meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragment is completely overridden with
- * new one drop it.
- */
- next = next->next;
-
- if (prev)
- prev->next = next;
- else
- qp->q.fragments = next;
-
- qp->q.meat -= free_it->len;
- sub_frag_mem_limit(qp->q.net, free_it->truesize);
- kfree_skb(free_it);
- }
+ /* Find out where to put this fragment. */
+ skb1 = qp->q.fragments_tail;
+ if (!skb1) {
+ /* This is the first fragment we've received. */
+ rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+ qp->q.fragments_tail = skb;
+ } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+ /* This is the common/special case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < (skb1->ip_defrag_offset + skb1->len))
+ goto discard_qp;
+ /* Insert after skb1. */
+ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+ qp->q.fragments_tail = skb;
+ } else {
+ /* Binary search. Note that skb can become the first fragment, but
+ * not the last (covered above). */
+ rbn = &qp->q.rb_fragments.rb_node;
+ do {
+ parent = *rbn;
+ skb1 = rb_to_skb(parent);
+ if (end <= skb1->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= skb1->ip_defrag_offset + skb1->len)
+ rbn = &parent->rb_right;
+ else /* Found an overlap with skb1. */
+ goto discard_qp;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb. */
+ rb_link_node(&skb->rbnode, parent, rbn);
}
+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
- dev = skb->dev;
if (dev)
qp->iif = dev->ifindex;
- /* Makes sure compiler wont do silly aliasing games */
- barrier();
skb->ip_defrag_offset = offset;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- qp->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- qp->q.fragments = skb;
-
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
@@ -450,7 +425,7 @@ found:
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, prev, dev);
+ err = ip_frag_reasm(qp, skb, dev);
skb->_skb_refdst = orefdst;
return err;
}
@@ -458,20 +433,24 @@ found:
skb_dst_drop(skb);
return -EINPROGRESS;
+discard_qp:
+ inet_frag_kill(&qp->q);
+ err = -EINVAL;
+ __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
err:
kfree_skb(skb);
return err;
}
-
/* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
- struct sk_buff *fp, *head = qp->q.fragments;
+ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+ struct sk_buff **nextp; /* To build frag_list. */
+ struct rb_node *rbn;
int len;
int ihlen;
int err;
@@ -485,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_fail;
}
/* Make the one we just received the head. */
- if (prev) {
- head = prev->next;
- fp = skb_clone(head, GFP_ATOMIC);
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
goto out_nomem;
-
- fp->next = head->next;
- if (!fp->next)
+ rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+ if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp;
- prev->next = fp;
-
- skb_morph(head, qp->q.fragments);
- head->next = qp->q.fragments->next;
-
- consume_skb(qp->q.fragments);
- qp->q.fragments = head;
+ skb_morph(skb, head);
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &qp->q.rb_fragments);
+ consume_skb(head);
+ head = skb;
}
- WARN_ON(!head);
WARN_ON(head->ip_defrag_offset != 0);
/* Allocate a new buffer for the datagram. */
@@ -528,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
goto out_nomem;
- clone->next = head->next;
- head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
+ skb->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
+ /* Traverse the tree in order, to build frag_list. */
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ while (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &qp->q.rb_fragments);
+ rbn = rbnext;
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
@@ -556,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
}
sub_frag_mem_limit(qp->q.net, head->truesize);
+ *nextp = NULL;
head->next = NULL;
+ head->prev = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -584,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c8ca5d8f0f75..51a5d06085ac 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -985,7 +985,6 @@ static void ipgre_tunnel_setup(struct net_device *dev)
static void __gre_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel;
- int t_hlen;
tunnel = netdev_priv(dev);
tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -993,8 +992,6 @@ static void __gre_tunnel_init(struct net_device *dev)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
- t_hlen = tunnel->hlen + sizeof(struct iphdr);
-
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
@@ -1304,13 +1301,11 @@ static const struct net_device_ops gre_tap_netdev_ops = {
static int erspan_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- int t_hlen;
tunnel->tun_hlen = 8;
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
erspan_hdr_len(tunnel->erspan_ver);
- t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e2b6bd478afb..9c4e72e9c60a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -524,6 +524,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->dev = from->dev;
to->mark = from->mark;
+ skb_copy_hash(to, from);
+
/* Copy the flags to each fragment. */
IPCB(to)->flags = IPCB(from)->flags;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc32fdbeefa6..c0fe5ad996f2 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
struct sockaddr_in sin;
const struct iphdr *iph = ip_hdr(skb);
- __be16 *ports = (__be16 *)skb_transport_header(skb);
+ __be16 *ports;
+ int end;
- if (skb_transport_offset(skb) + 4 > (int)skb->len)
+ end = skb_transport_offset(skb) + 4;
+ if (end > 0 && !pskb_may_pull(skb, end))
return;
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
+ ports = (__be16 *)skb_transport_header(skb);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = iph->daddr;
@@ -984,7 +987,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
mreq.imr_address.s_addr = mreqs.imr_interface;
mreq.imr_ifindex = 0;
- err = ip_mc_join_group(sk, &mreq);
+ err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
if (err && err != -EADDRINUSE)
break;
omode = MCAST_INCLUDE;
@@ -1061,7 +1064,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
mreq.imr_multiaddr = psin->sin_addr;
mreq.imr_address.s_addr = 0;
mreq.imr_ifindex = greqs.gsr_interface;
- err = ip_mc_join_group(sk, &mreq);
+ err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
if (err && err != -EADDRINUSE)
break;
greqs.gsr_interface = mreq.imr_ifindex;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 82f914122f1b..5660adcf7a04 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1052,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *skb;
int ret;
- if (assert == IGMPMSG_WHOLEPKT)
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
skb = alloc_skb(128, GFP_ATOMIC);
@@ -1060,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
if (!skb)
return -ENOBUFS;
- if (assert == IGMPMSG_WHOLEPKT) {
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
/* Ugly, but we have no choice with this interface.
* Duplicate old header, fix ihl, length etc.
* And all this only to mangle msg->im_msgtype and
@@ -1071,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt,
skb_reset_transport_header(skb);
msg = (struct igmpmsg *)skb_network_header(skb);
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
- msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_msgtype = assert;
msg->im_mbz = 0;
- msg->im_vif = mrt->mroute_reg_vif_num;
+ if (assert == IGMPMSG_WRVIFWHOLE)
+ msg->im_vif = vifi;
+ else
+ msg->im_vif = mrt->mroute_reg_vif_num;
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -1372,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
struct mr_table *mrt;
struct vifctl vif;
struct mfcctl mfc;
+ bool do_wrvifwhole;
u32 uval;
/* There's one exception to the lock - MRT_DONE which needs to unlock */
@@ -1502,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
break;
}
+ do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
val = !!val;
if (val != mrt->mroute_do_pim) {
mrt->mroute_do_pim = val;
mrt->mroute_do_assert = val;
+ mrt->mroute_do_wrvifwhole = do_wrvifwhole;
}
break;
case MRT_TABLE:
@@ -1983,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
MFC_ASSERT_THRESH)) {
c->_c.mfc_un.res.last_assert = jiffies;
ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ipmr_cache_report(mrt, skb, true_vifi,
+ IGMPMSG_WRVIFWHOLE);
}
goto dont_forward;
}
@@ -2659,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
mrt->mroute_reg_vif_num) ||
nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
mrt->mroute_do_assert) ||
- nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+ mrt->mroute_do_wrvifwhole))
return false;
return true;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index e6774ccb7731..8d2e5dc9a827 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
}
EXPORT_SYMBOL_GPL(nf_ip_reroute);
-__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol)
-{
- const struct iphdr *iph = ip_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
- break;
- if ((protocol == 0 && !csum_fold(skb->csum)) ||
- !csum_tcpudp_magic(iph->saddr, iph->daddr,
- skb->len - dataoff, protocol,
- skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
- /* fall through */
- case CHECKSUM_NONE:
- if (protocol == 0)
- skb->csum = 0;
- else
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len - dataoff,
- protocol, 0);
- csum = __skb_checksum_complete(skb);
- }
- return csum;
-}
-EXPORT_SYMBOL(nf_ip_checksum);
-
-__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, unsigned int len,
- u_int8_t protocol)
-{
- const struct iphdr *iph = ip_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (len == skb->len - dataoff)
- return nf_ip_checksum(skb, hook, dataoff, protocol);
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
- skb->len - dataoff, 0);
- skb->ip_summed = CHECKSUM_NONE;
- return __skb_checksum_complete_head(skb, dataoff + len);
- }
- return csum;
-}
-EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
-
int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
bool strict __always_unused)
{
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index bbfc356cb1b5..d9504adc47b3 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4
tristate
default n
-config NF_CONNTRACK_IPV4
- tristate "IPv4 connection tracking support (required for NAT)"
- depends on NF_CONNTRACK
- default m if NETFILTER_ADVANCED=n
- select NF_DEFRAG_IPV4
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
-
- This is IPv4 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config NF_SOCKET_IPV4
tristate "IPv4 socket lookup support"
help
@@ -112,7 +96,7 @@ config NF_REJECT_IPV4
config NF_NAT_IPV4
tristate "IPv4 NAT"
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
default m if NETFILTER_ADVANCED=n
select NF_NAT
help
@@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY
# NAT + specific targets: nf_conntrack
config IP_NF_NAT
tristate "iptables NAT support"
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
default m if NETFILTER_ADVANCED=n
select NF_NAT
select NF_NAT_IPV4
@@ -340,7 +324,7 @@ config IP_NF_MANGLE
config IP_NF_TARGET_CLUSTERIP
tristate "CLUSTERIP target support"
depends on IP_NF_MANGLE
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
select NETFILTER_FAMILY_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8394c17c269f..367993adf4d3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,12 +3,6 @@
# Makefile for the netfilter modules on top of IPv4.
#
-# objects for l3 independent conntrack
-nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-
-# connection tracking
-obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
-
nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ca0dad90803a..e77872c93c20 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
.checkentry = icmp_checkentry,
.proto = IPPROTO_ICMP,
.family = NFPROTO_IPV4,
+ .me = THIS_MODULE,
},
};
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
deleted file mode 100644
index 9db988f9a4d7..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ /dev/null
@@ -1,472 +0,0 @@
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#include <net/netfilter/nf_log.h>
-
-static int conntrack4_net_id __read_mostly;
-static DEFINE_MUTEX(register_ipv4_hooks);
-
-struct conntrack4_net {
- unsigned int users;
-};
-
-static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- const __be32 *ap;
- __be32 _addrs[2];
- ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
- sizeof(u_int32_t) * 2, _addrs);
- if (ap == NULL)
- return false;
-
- tuple->src.u3.ip = ap[0];
- tuple->dst.u3.ip = ap[1];
-
- return true;
-}
-
-static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u3.ip = orig->dst.u3.ip;
- tuple->dst.u3.ip = orig->src.u3.ip;
-
- return true;
-}
-
-static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
- unsigned int *dataoff, u_int8_t *protonum)
-{
- const struct iphdr *iph;
- struct iphdr _iph;
-
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
- if (iph == NULL)
- return -NF_ACCEPT;
-
- /* Conntrack defragments packets, we might still see fragments
- * inside ICMP packets though. */
- if (iph->frag_off & htons(IP_OFFSET))
- return -NF_ACCEPT;
-
- *dataoff = nhoff + (iph->ihl << 2);
- *protonum = iph->protocol;
-
- /* Check bogus IP headers */
- if (*dataoff > skb->len) {
- pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
- "nhoff %u, ihl %u, skblen %u\n",
- nhoff, iph->ihl << 2, skb->len);
- return -NF_ACCEPT;
- }
-
- return NF_ACCEPT;
-}
-
-static unsigned int ipv4_helper(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
-
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
-}
-
-static unsigned int ipv4_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
-
- /* adjust seqs for loopback traffic only in outgoing direction */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
- }
-out:
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb);
-}
-
-static unsigned int ipv4_conntrack_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-static unsigned int ipv4_conntrack_local(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
- enum ip_conntrack_info ctinfo;
- struct nf_conn *tmpl;
-
- tmpl = nf_ct_get(skb, &ctinfo);
- if (tmpl && nf_ct_is_template(tmpl)) {
- /* when skipping ct, clear templates to avoid fooling
- * later targets/matches
- */
- skb->_nfct = 0;
- nf_ct_put(tmpl);
- }
- return NF_ACCEPT;
- }
-
- return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-/* Connection tracking may drop packets, but never alters them, so
- make it the first hook. */
-static const struct nf_hook_ops ipv4_conntrack_ops[] = {
- {
- .hook = ipv4_conntrack_in,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_conntrack_local,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
- {
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
-};
-
-/* Fast function for those who don't want to parse /proc (and I don't
- blame them). */
-/* Reversing the socket's dst/src point of view gives us the reply
- mapping. */
-static int
-getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
- const struct inet_sock *inet = inet_sk(sk);
- const struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
-
- memset(&tuple, 0, sizeof(tuple));
-
- lock_sock(sk);
- tuple.src.u3.ip = inet->inet_rcv_saddr;
- tuple.src.u.tcp.port = inet->inet_sport;
- tuple.dst.u3.ip = inet->inet_daddr;
- tuple.dst.u.tcp.port = inet->inet_dport;
- tuple.src.l3num = PF_INET;
- tuple.dst.protonum = sk->sk_protocol;
- release_sock(sk);
-
- /* We only do TCP and SCTP at the moment: is there a better way? */
- if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP) {
- pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
- return -ENOPROTOOPT;
- }
-
- if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
- *len, sizeof(struct sockaddr_in));
- return -EINVAL;
- }
-
- h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (h) {
- struct sockaddr_in sin;
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
- sin.sin_family = AF_INET;
- sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u.tcp.port;
- sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u3.ip;
- memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
-
- pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
- nf_ct_put(ct);
- if (copy_to_user(user, &sin, sizeof(sin)) != 0)
- return -EFAULT;
- else
- return 0;
- }
- pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
- &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
- return -ENOENT;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
-{
- if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
- nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
- [CTA_IP_V4_SRC] = { .type = NLA_U32 },
- [CTA_IP_V4_DST] = { .type = NLA_U32 },
-};
-
-static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
-{
- if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
- return -EINVAL;
-
- t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
- t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
-
- return 0;
-}
-#endif
-
-static struct nf_sockopt_ops so_getorigdst = {
- .pf = PF_INET,
- .get_optmin = SO_ORIGINAL_DST,
- .get_optmax = SO_ORIGINAL_DST+1,
- .get = getorigdst,
- .owner = THIS_MODULE,
-};
-
-static int ipv4_hooks_register(struct net *net)
-{
- struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
- int err = 0;
-
- mutex_lock(&register_ipv4_hooks);
-
- cnet->users++;
- if (cnet->users > 1)
- goto out_unlock;
-
- err = nf_defrag_ipv4_enable(net);
- if (err) {
- cnet->users = 0;
- goto out_unlock;
- }
-
- err = nf_register_net_hooks(net, ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
-
- if (err)
- cnet->users = 0;
- out_unlock:
- mutex_unlock(&register_ipv4_hooks);
- return err;
-}
-
-static void ipv4_hooks_unregister(struct net *net)
-{
- struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
-
- mutex_lock(&register_ipv4_hooks);
- if (cnet->users && (--cnet->users == 0))
- nf_unregister_net_hooks(net, ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- mutex_unlock(&register_ipv4_hooks);
-}
-
-const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
- .l3proto = PF_INET,
- .pkt_to_tuple = ipv4_pkt_to_tuple,
- .invert_tuple = ipv4_invert_tuple,
- .get_l4proto = ipv4_get_l4proto,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = ipv4_tuple_to_nlattr,
- .nlattr_to_tuple = ipv4_nlattr_to_tuple,
- .nla_policy = ipv4_nla_policy,
- .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */
- NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */
-#endif
- .net_ns_get = ipv4_hooks_register,
- .net_ns_put = ipv4_hooks_unregister,
- .me = THIS_MODULE,
-};
-
-module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
- &nf_conntrack_htable_size, 0600);
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
-MODULE_ALIAS("ip_conntrack");
-MODULE_LICENSE("GPL");
-
-static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = {
- &nf_conntrack_l4proto_tcp4,
- &nf_conntrack_l4proto_udp4,
- &nf_conntrack_l4proto_icmp,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- &nf_conntrack_l4proto_dccp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
- &nf_conntrack_l4proto_sctp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
- &nf_conntrack_l4proto_udplite4,
-#endif
-};
-
-static int ipv4_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
-}
-
-static void ipv4_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
-}
-
-static struct pernet_operations ipv4_net_ops = {
- .init = ipv4_net_init,
- .exit = ipv4_net_exit,
- .id = &conntrack4_net_id,
- .size = sizeof(struct conntrack4_net),
-};
-
-static int __init nf_conntrack_l3proto_ipv4_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) !=
- nf_conntrack_l3proto_ipv4.nla_size))
- return -EINVAL;
-#endif
- ret = nf_register_sockopt(&so_getorigdst);
- if (ret < 0) {
- pr_err("Unable to register netfilter socket option\n");
- return ret;
- }
-
- ret = register_pernet_subsys(&ipv4_net_ops);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
- goto cleanup_sockopt;
- }
-
- ret = nf_ct_l4proto_register(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- if (ret < 0)
- goto cleanup_pernet;
-
- ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
- goto cleanup_l4proto;
- }
-
- return ret;
-cleanup_l4proto:
- nf_ct_l4proto_unregister(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- cleanup_pernet:
- unregister_pernet_subsys(&ipv4_net_ops);
- cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst);
- return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv4_fini(void)
-{
- synchronize_net();
- nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_unregister(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- unregister_pernet_subsys(&ipv4_net_ops);
- nf_unregister_sockopt(&so_getorigdst);
-}
-
-module_init(nf_conntrack_l3proto_ipv4_init);
-module_exit(nf_conntrack_l3proto_ipv4_fini);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
deleted file mode 100644
index 5c15beafa711..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ /dev/null
@@ -1,383 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <linux/in.h>
-#include <linux/icmp.h>
-#include <linux/seq_file.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack_tuple.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_log.h>
-
-static const unsigned int nf_ct_icmp_timeout = 30*HZ;
-
-static inline struct nf_icmp_net *icmp_pernet(struct net *net)
-{
- return &net->ct.nf_ct_proto.icmp;
-}
-
-static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- const struct icmphdr *hp;
- struct icmphdr _hdr;
-
- hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->dst.u.icmp.type = hp->type;
- tuple->src.u.icmp.id = hp->un.echo.id;
- tuple->dst.u.icmp.code = hp->code;
-
- return true;
-}
-
-/* Add 1; spaces filled with 0. */
-static const u_int8_t invmap[] = {
- [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
- [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
- [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
- [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
- [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
- [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
- [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
- [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
-};
-
-static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- if (orig->dst.u.icmp.type >= sizeof(invmap) ||
- !invmap[orig->dst.u.icmp.type])
- return false;
-
- tuple->src.u.icmp.id = orig->src.u.icmp.id;
- tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
- tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
- return true;
-}
-
-static unsigned int *icmp_get_timeouts(struct net *net)
-{
- return &icmp_pernet(net)->timeout;
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int icmp_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeout)
-{
- /* Do not immediately delete the connection after the first
- successful reply to avoid excessive conntrackd traffic
- and also to handle correctly ICMP echo reply duplicates. */
- nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- static const u_int8_t valid_new[] = {
- [ICMP_ECHO] = 1,
- [ICMP_TIMESTAMP] = 1,
- [ICMP_INFO_REQUEST] = 1,
- [ICMP_ADDRESS] = 1
- };
-
- if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
- !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
- /* Can't create a new ICMP `conn' with this. */
- pr_debug("icmp: can't create new conn with type %u\n",
- ct->tuplehash[0].tuple.dst.u.icmp.type);
- nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
- return false;
- }
- return true;
-}
-
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-static int
-icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
- unsigned int hooknum)
-{
- struct nf_conntrack_tuple innertuple, origtuple;
- const struct nf_conntrack_l4proto *innerproto;
- const struct nf_conntrack_tuple_hash *h;
- const struct nf_conntrack_zone *zone;
- enum ip_conntrack_info ctinfo;
- struct nf_conntrack_zone tmp;
-
- WARN_ON(skb_nfct(skb));
- zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
-
- /* Are they talking about one of our connections? */
- if (!nf_ct_get_tuplepr(skb,
- skb_network_offset(skb) + ip_hdrlen(skb)
- + sizeof(struct icmphdr),
- PF_INET, net, &origtuple)) {
- pr_debug("icmp_error_message: failed to get tuple\n");
- return -NF_ACCEPT;
- }
-
- /* rcu_read_lock()ed by nf_hook_thresh */
- innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
-
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&innertuple, &origtuple,
- &nf_conntrack_l3proto_ipv4, innerproto)) {
- pr_debug("icmp_error_message: no match\n");
- return -NF_ACCEPT;
- }
-
- ctinfo = IP_CT_RELATED;
-
- h = nf_conntrack_find_get(net, zone, &innertuple);
- if (!h) {
- pr_debug("icmp_error_message: no match\n");
- return -NF_ACCEPT;
- }
-
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- ctinfo += IP_CT_IS_REPLY;
-
- /* Update skb to refer to this connection */
- nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
- return NF_ACCEPT;
-}
-
-static void icmp_error_log(const struct sk_buff *skb, struct net *net,
- u8 pf, const char *msg)
-{
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg);
-}
-
-/* Small and modified version of icmp_rcv */
-static int
-icmp_error(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb, unsigned int dataoff,
- u8 pf, unsigned int hooknum)
-{
- const struct icmphdr *icmph;
- struct icmphdr _ih;
-
- /* Not enough header? */
- icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
- if (icmph == NULL) {
- icmp_error_log(skb, net, pf, "short packet");
- return -NF_ACCEPT;
- }
-
- /* See ip_conntrack_proto_tcp.c */
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_ip_checksum(skb, hooknum, dataoff, 0)) {
- icmp_error_log(skb, net, pf, "bad hw icmp checksum");
- return -NF_ACCEPT;
- }
-
- /*
- * 18 is the highest 'known' ICMP type. Anything else is a mystery
- *
- * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
- * discarded.
- */
- if (icmph->type > NR_ICMP_TYPES) {
- icmp_error_log(skb, net, pf, "invalid icmp type");
- return -NF_ACCEPT;
- }
-
- /* Need to track icmp error message? */
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_SOURCE_QUENCH &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB &&
- icmph->type != ICMP_REDIRECT)
- return NF_ACCEPT;
-
- return icmp_error_message(net, tmpl, skb, hooknum);
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int icmp_tuple_to_nlattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *t)
-{
- if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
- nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
- nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
- [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 },
- [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 },
- [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 },
-};
-
-static int icmp_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *tuple)
-{
- if (!tb[CTA_PROTO_ICMP_TYPE] ||
- !tb[CTA_PROTO_ICMP_CODE] ||
- !tb[CTA_PROTO_ICMP_ID])
- return -EINVAL;
-
- tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
- tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
- tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
-
- if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
- !invmap[tuple->dst.u.icmp.type])
- return -EINVAL;
-
- return 0;
-}
-
-static unsigned int icmp_nlattr_tuple_size(void)
-{
- static unsigned int size __read_mostly;
-
- if (!size)
- size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
-
- return size;
-}
-#endif
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
- struct net *net, void *data)
-{
- unsigned int *timeout = data;
- struct nf_icmp_net *in = icmp_pernet(net);
-
- if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
- *timeout =
- ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
- } else {
- /* Set default ICMP timeout. */
- *timeout = in->timeout;
- }
- return 0;
-}
-
-static int
-icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
- const unsigned int *timeout = data;
-
- if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -ENOSPC;
-}
-
-static const struct nla_policy
-icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
- [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table icmp_sysctl_table[] = {
- {
- .procname = "nf_conntrack_icmp_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct nf_icmp_net *in)
-{
-#ifdef CONFIG_SYSCTL
- pn->ctl_table = kmemdup(icmp_sysctl_table,
- sizeof(icmp_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &in->timeout;
-#endif
- return 0;
-}
-
-static int icmp_init_net(struct net *net, u_int16_t proto)
-{
- struct nf_icmp_net *in = icmp_pernet(net);
- struct nf_proto_net *pn = &in->pn;
-
- in->timeout = nf_ct_icmp_timeout;
-
- return icmp_kmemdup_sysctl_table(pn, in);
-}
-
-static struct nf_proto_net *icmp_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.icmp.pn;
-}
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
-{
- .l3proto = PF_INET,
- .l4proto = IPPROTO_ICMP,
- .pkt_to_tuple = icmp_pkt_to_tuple,
- .invert_tuple = icmp_invert_tuple,
- .packet = icmp_packet,
- .get_timeouts = icmp_get_timeouts,
- .new = icmp_new,
- .error = icmp_error,
- .destroy = NULL,
- .me = NULL,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = icmp_tuple_to_nlattr,
- .nlattr_tuple_size = icmp_nlattr_tuple_size,
- .nlattr_to_tuple = icmp_nlattr_to_tuple,
- .nla_policy = icmp_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = icmp_timeout_nlattr_to_obj,
- .obj_to_nlattr = icmp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_ICMP_MAX,
- .obj_size = sizeof(unsigned int),
- .nla_policy = icmp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .init_net = icmp_init_net,
- .get_net_proto = icmp_get_net_proto,
-};
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 805e83ec3ad9..164714104965 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+ sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
iph->saddr, laddr ? laddr : iph->daddr,
hp->source, lport ? lport : hp->dest,
skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
const u8 protocol,
const __be32 saddr, const __be32 daddr,
const __be16 sport, const __be16 dport,
@@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
const enum nf_tproxy_lookup_t lookup_type)
{
struct sock *sk;
- struct tcphdr *tcph;
switch (protocol) {
- case IPPROTO_TCP:
+ case IPPROTO_TCP: {
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(struct tcphdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- tcph = hp;
sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
ip_hdrlen(skb) +
- __tcp_hdrlen(tcph),
+ __tcp_hdrlen(hp),
saddr, sport,
daddr, dport,
in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
BUG();
}
break;
+ }
case IPPROTO_UDP:
sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
in->ifindex);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index b54c964ad925..8d7aaf118a30 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
chk_addr_ret = RTN_LOCAL;
- if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
- isk->freebind == 0 && isk->transparent == 0 &&
+ if ((!inet_can_nonlocal_bind(net, isk) &&
chk_addr_ret != RTN_LOCAL) ||
chk_addr_ret == RTN_MULTICAST ||
chk_addr_ret == RTN_BROADCAST)
@@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
scoped);
rcu_read_unlock();
- if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
- isk->freebind || isk->transparent || has_addr ||
+ if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b46e4cf9a55a..70289682a670 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97106d7..b678466da451 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto no_route;
}
- if (res->type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST) {
+ if (IN_DEV_BFORWARD(in_dev))
+ goto make_route;
goto brd_input;
+ }
if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res->type != RTN_UNICAST)
goto martian_destination;
+make_route:
err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out: return err;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index af0a857d8352..b92f422f2fa8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
if (write && ret == 0) {
low = make_kgid(user_ns, urange[0]);
high = make_kgid(user_ns, urange[1]);
- if (!gid_valid(low) || !gid_valid(high) ||
- (urange[1] < urange[0]) || gid_lt(high, low)) {
+ if (!gid_valid(low) || !gid_valid(high))
+ return -EINVAL;
+ if (urange[1] < urange[0] || gid_lt(high, low)) {
low = make_kgid(&init_user_ns, 1);
high = make_kgid(&init_user_ns, 0);
}
@@ -200,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
+static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_fwd_update_priority);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE,
+ net);
+
+ return ret;
+}
+
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -663,6 +681,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ip_forward_update_priority",
+ .data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipv4_fwd_update_priority,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e3704a49164b..b8af2fec5ad5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
const struct tcp_sock *tp = tcp_sk(sk);
int state;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
@@ -1994,7 +1994,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
* shouldn't happen.
*/
if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
- "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
+ "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
flags))
break;
@@ -2009,7 +2009,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK),
- "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+ "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
}
@@ -2531,7 +2531,6 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int err = 0;
int old_state = sk->sk_state;
if (old_state != TCP_CLOSE)
@@ -2555,6 +2554,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
+ tp->copied_seq = tp->rcv_nxt;
+ tp->urg_data = 0;
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -2592,6 +2593,10 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
+ tp->bytes_sent = 0;
+ tp->bytes_retrans = 0;
+ tp->dsack_dups = 0;
+ tp->reord_seen = 0;
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
@@ -2606,7 +2611,7 @@ int tcp_disconnect(struct sock *sk, int flags)
}
sk->sk_error_report(sk);
- return err;
+ return 0;
}
EXPORT_SYMBOL(tcp_disconnect);
@@ -2815,14 +2820,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_REPAIR:
if (!tcp_can_repair_sock(sk))
err = -EPERM;
- else if (val == 1) {
+ else if (val == TCP_REPAIR_ON) {
tp->repair = 1;
sk->sk_reuse = SK_FORCE_REUSE;
tp->repair_queue = TCP_NO_QUEUE;
- } else if (val == 0) {
+ } else if (val == TCP_REPAIR_OFF) {
tp->repair = 0;
sk->sk_reuse = SK_NO_REUSE;
tcp_send_window_probe(sk);
+ } else if (val == TCP_REPAIR_OFF_NO_WP) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
} else
err = -EINVAL;
@@ -2984,7 +2992,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (val < 0)
err = -EINVAL;
else
- icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ icsk->icsk_user_timeout = val;
break;
case TCP_FASTOPEN:
@@ -3196,10 +3204,41 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_delivery_rate = rate64;
info->tcpi_delivered = tp->delivered;
info->tcpi_delivered_ce = tp->delivered_ce;
+ info->tcpi_bytes_sent = tp->bytes_sent;
+ info->tcpi_bytes_retrans = tp->bytes_retrans;
+ info->tcpi_dsack_dups = tp->dsack_dups;
+ info->tcpi_reord_seen = tp->reord_seen;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
+static size_t tcp_opt_stats_get_size(void)
+{
+ return
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ 0;
+}
+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -3208,9 +3247,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
u64 rate64;
u32 rate;
- stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
- 7 * nla_total_size(sizeof(u32)) +
- 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
+ stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -3246,6 +3283,13 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
+ TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+ TCP_NLA_PAD);
+ nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+ nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+
return stats;
}
@@ -3440,7 +3484,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_USER_TIMEOUT:
- val = jiffies_to_msecs(icsk->icsk_user_timeout);
+ val = icsk->icsk_user_timeout;
break;
case TCP_FASTOPEN:
@@ -3714,8 +3758,7 @@ int tcp_abort(struct sock *sk, int err)
struct request_sock *req = inet_reqsk(sk);
local_bh_disable();
- inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
- req);
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
local_bh_enable();
return 0;
}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 3b5f45b9e81e..13d34427ca3d 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -358,6 +358,10 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
cwnd = (cwnd + 1) & ~1U;
+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+ if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+ cwnd += 2;
+
return cwnd;
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5f5e5936760e..8b637f9f23a2 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -55,7 +55,6 @@ struct dctcp {
u32 dctcp_alpha;
u32 next_seq;
u32 ce_state;
- u32 delayed_ack_reserved;
u32 loss_cwnd;
};
@@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk)
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
- ca->delayed_ack_reserved = 0;
ca->loss_cwnd = 0;
ca->ce_state = 0;
@@ -131,23 +129,14 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* State has changed from CE=0 to CE=1 and delayed
- * ACK has not sent yet.
- */
- if (!ca->ce_state && ca->delayed_ack_reserved) {
- u32 tmp_rcv_nxt;
-
- /* Save current rcv_nxt. */
- tmp_rcv_nxt = tp->rcv_nxt;
-
- /* Generate previous ack with CE=0. */
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
- tp->rcv_nxt = ca->prior_rcv_nxt;
-
- tcp_send_ack(sk);
-
- /* Recover current rcv_nxt. */
- tp->rcv_nxt = tmp_rcv_nxt;
+ if (!ca->ce_state) {
+ /* State has changed from CE=0 to CE=1, force an immediate
+ * ACK to reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ tcp_enter_quickack_mode(sk, 1);
}
ca->prior_rcv_nxt = tp->rcv_nxt;
@@ -161,23 +150,14 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* State has changed from CE=1 to CE=0 and delayed
- * ACK has not sent yet.
- */
- if (ca->ce_state && ca->delayed_ack_reserved) {
- u32 tmp_rcv_nxt;
-
- /* Save current rcv_nxt. */
- tmp_rcv_nxt = tp->rcv_nxt;
-
- /* Generate previous ack with CE=1. */
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
- tp->rcv_nxt = ca->prior_rcv_nxt;
-
- tcp_send_ack(sk);
-
- /* Recover current rcv_nxt. */
- tp->rcv_nxt = tmp_rcv_nxt;
+ if (ca->ce_state) {
+ /* State has changed from CE=1 to CE=0, force an immediate
+ * ACK to reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ tcp_enter_quickack_mode(sk, 1);
}
ca->prior_rcv_nxt = tp->rcv_nxt;
@@ -248,25 +228,6 @@ static void dctcp_state(struct sock *sk, u8 new_state)
}
}
-static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
-{
- struct dctcp *ca = inet_csk_ca(sk);
-
- switch (ev) {
- case CA_EVENT_DELAYED_ACK:
- if (!ca->delayed_ack_reserved)
- ca->delayed_ack_reserved = 1;
- break;
- case CA_EVENT_NON_DELAYED_ACK:
- if (ca->delayed_ack_reserved)
- ca->delayed_ack_reserved = 0;
- break;
- default:
- /* Don't care for the rest. */
- break;
- }
-}
-
static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
switch (ev) {
@@ -276,10 +237,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
case CA_EVENT_ECN_NO_CE:
dctcp_ce_state_1_to_0(sk);
break;
- case CA_EVENT_DELAYED_ACK:
- case CA_EVENT_NON_DELAYED_ACK:
- dctcp_update_ack_reserved(sk, ev);
- break;
default:
/* Don't care for the rest. */
break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d3b6390ecf23..715d541b52dd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -216,7 +216,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.quick = quickacks;
}
-static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -224,6 +224,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
+EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
@@ -246,8 +247,15 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
- if (tcp_hdr(skb)->cwr)
+ if (tcp_hdr(skb)->cwr) {
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+
+ /* If the sender is telling us it has entered CWR, then its
+ * cwnd may be very low (even just 1 packet), so we should ACK
+ * immediately.
+ */
+ tcp_enter_quickack_mode((struct sock *)tp, 2);
+ }
}
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
@@ -873,6 +881,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
{
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
tp->rack.dsack_seen = 1;
+ tp->dsack_dups++;
}
/* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -904,8 +913,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
}
- tp->rack.reord = 1;
/* This exciting event is worth to be remembered. 8) */
+ tp->reord_seen++;
NET_INC_STATS(sock_net(sk),
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
@@ -1869,6 +1878,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
tp->reordering = min_t(u32, tp->packets_out + addend,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+ tp->reord_seen++;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
@@ -4343,6 +4353,11 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+#ifdef CONFIG_TLS_DEVICE
+ if (from->decrypted != to->decrypted)
+ return false;
+#endif
+
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -4361,6 +4376,23 @@ static bool tcp_try_coalesce(struct sock *sk,
return true;
}
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+ /* In case tcp_drop() is called later, update to->gso_segs */
+ if (res) {
+ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+ max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+ }
+ return res;
+}
+
static void tcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
@@ -4484,8 +4516,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
/* In the typical case, we are adding an skb to the end of the list.
* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
*/
- if (tcp_try_coalesce(sk, tp->ooo_last_skb,
- skb, &fragstolen)) {
+ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+ skb, &fragstolen)) {
coalesce_done:
tcp_grow_window(sk, skb);
kfree_skb_partial(skb, fragstolen);
@@ -4513,7 +4545,7 @@ coalesce_done:
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4532,11 +4564,11 @@ coalesce_done:
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb1);
+ tcp_drop(sk, skb1);
goto merge_right;
}
- } else if (tcp_try_coalesce(sk, skb1,
- skb, &fragstolen)) {
+ } else if (tcp_ooo_try_coalesce(sk, skb1,
+ skb, &fragstolen)) {
goto coalesce_done;
}
p = &parent->rb_right;
@@ -4695,7 +4727,6 @@ queue_and_out:
}
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
- tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4872,6 +4903,9 @@ restart:
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+#ifdef CONFIG_TLS_DEVICE
+ nskb->decrypted = skb->decrypted;
+#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
@@ -4899,6 +4933,10 @@ restart:
skb == tail ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
+#ifdef CONFIG_TLS_DEVICE
+ if (skb->decrypted != nskb->decrypted)
+ goto end;
+#endif
}
}
}
@@ -4913,6 +4951,7 @@ end:
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 range_truesize, sum_tiny = 0;
struct sk_buff *skb, *head;
u32 start, end;
@@ -4924,6 +4963,7 @@ new_range:
}
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
+ range_truesize = skb->truesize;
for (head = skb;;) {
skb = skb_rb_next(skb);
@@ -4934,11 +4974,20 @@ new_range:
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, NULL, &tp->out_of_order_queue,
- head, skb, start, end);
+ /* Do not attempt collapsing tiny skbs */
+ if (range_truesize != head->truesize ||
+ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+ head, skb, start, end);
+ } else {
+ sum_tiny += range_truesize;
+ if (sum_tiny > sk->sk_rcvbuf >> 3)
+ return;
+ }
goto new_range;
}
+ range_truesize += skb->truesize;
if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
start = TCP_SKB_CB(skb)->seq;
if (after(TCP_SKB_CB(skb)->end_seq, end))
@@ -4953,6 +5002,7 @@ new_range:
* 2) not add too big latencies if thousands of packets sit there.
* (But if application shrinks SO_RCVBUF, we could still end up
* freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
*
* Return true if queue has shrunk.
*/
@@ -4960,20 +5010,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node *node, *prev;
+ int goal;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ goal = sk->sk_rcvbuf >> 3;
node = &tp->ooo_last_skb->rbnode;
do {
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
+ goal -= rb_to_skb(node)->truesize;
tcp_drop(sk, rb_to_skb(node));
- sk_mem_reclaim(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
- !tcp_under_memory_pressure(sk))
- break;
+ if (!prev || goal <= 0) {
+ sk_mem_reclaim(sk);
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ !tcp_under_memory_pressure(sk))
+ break;
+ goal = sk->sk_rcvbuf >> 3;
+ }
node = prev;
} while (node);
tp->ooo_last_skb = rb_to_skb(prev);
@@ -5008,6 +5064,9 @@ static int tcp_prune_queue(struct sock *sk)
else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
tcp_collapse(sk, &sk->sk_receive_queue, NULL,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dc415c66a33a..9e041fa5c545 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -157,11 +157,24 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
if (tcptw->tw_ts_recent_stamp &&
(!twp || (reuse && time_after32(ktime_get_seconds(),
tcptw->tw_ts_recent_stamp)))) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ /* In case of repair and re-using TIME-WAIT sockets we still
+ * want to be sure that it is safe as above but honor the
+ * sequence numbers and time stamps set as part of the repair
+ * process.
+ *
+ * Without this check re-using a TIME-WAIT socket with TCP
+ * repair would accumulate a -1 on the repair assigned
+ * sequence number. The first time it is reused the sequence
+ * is -1, the second time -2, etc. This fixes that issue
+ * without appearing to create any others.
+ */
+ if (likely(!tp->repair)) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
+ tp->write_seq = 1;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ }
sock_hold(sktw);
return 1;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index f5aee641f825..870b0a335061 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -262,6 +262,9 @@ found:
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+#ifdef CONFIG_TLS_DEVICE
+ flush |= p->decrypted ^ skb->decrypted;
+#endif
if (flush || skb_gro_receive(p, skb)) {
mss = 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f8f6129160dd..597dbd749f05 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
}
/* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
+ u32 rcv_nxt)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk);
}
+
+ if (unlikely(rcv_nxt != tp->rcv_nxt))
+ return; /* Special ACK sent by DCTCP to reflect ECN */
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -1009,8 +1013,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
- gfp_t gfp_mask)
+static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
+ int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
@@ -1086,7 +1090,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
- th->ack_seq = htonl(tp->rcv_nxt);
+ th->ack_seq = htonl(rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
@@ -1127,11 +1131,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
icsk->icsk_af_ops->send_check(sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
+ tp->bytes_sent += skb->len - tcp_header_size;
tcp_internal_pacing(sk, skb);
}
@@ -1164,6 +1169,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
return err;
}
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ gfp_t gfp_mask)
+{
+ return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
+ tcp_sk(sk)->rcv_nxt);
+}
+
/* This routine just queues the buffer for sending.
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
@@ -2686,9 +2698,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = skb_rb_next(skb);
- int skb_size, next_skb_size;
+ int next_skb_size;
- skb_size = skb->len;
next_skb_size = next_skb->len;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -2859,6 +2870,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
tp->total_retrans += segs;
+ tp->bytes_retrans += skb->len;
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
@@ -3509,8 +3521,6 @@ void tcp_send_delayed_ack(struct sock *sk)
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
- tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
-
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
@@ -3559,7 +3569,7 @@ void tcp_send_delayed_ack(struct sock *sk)
}
/* This routine sends an ack and also updates the window. */
-void tcp_send_ack(struct sock *sk)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
{
struct sk_buff *buff;
@@ -3567,8 +3577,6 @@ void tcp_send_ack(struct sock *sk)
if (sk->sk_state == TCP_CLOSE)
return;
- tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
-
/* We are not putting this on the write queue, so
* tcp_transmit_skb() will set the ownership to this
* sock.
@@ -3594,9 +3602,14 @@ void tcp_send_ack(struct sock *sk)
skb_set_tcp_pure_ack(buff);
/* Send it off, this clears delayed acks for us. */
- tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
+ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+}
+EXPORT_SYMBOL_GPL(__tcp_send_ack);
+
+void tcp_send_ack(struct sock *sk)
+{
+ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
}
-EXPORT_SYMBOL_GPL(tcp_send_ack);
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 71593e4400ab..c81aadff769b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tp->rack.reord) {
+ if (!tp->reord_seen) {
/* If reordering has not been observed, be aggressive during
* the recovery or starting the recovery by DUPACK threshold.
*/
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3b3611729928..7fdf222a0bdf 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,35 @@
#include <linux/gfp.h>
#include <net/tcp.h>
+static u32 tcp_retransmit_stamp(const struct sock *sk)
+{
+ u32 start_ts = tcp_sk(sk)->retrans_stamp;
+
+ if (unlikely(!start_ts)) {
+ struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+ if (!head)
+ return 0;
+ start_ts = tcp_skb_timestamp(head);
+ }
+ return start_ts;
+}
+
+static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 elapsed, start_ts;
+
+ start_ts = tcp_retransmit_stamp(sk);
+ if (!icsk->icsk_user_timeout || !start_ts)
+ return icsk->icsk_rto;
+ elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
+ if (elapsed >= icsk->icsk_user_timeout)
+ return 1; /* user timeout has passed; fire ASAP */
+ else
+ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed));
+}
+
/**
* tcp_write_err() - close socket and save error info
* @sk: The socket the error has appeared on.
@@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk,
if (!inet_csk(sk)->icsk_retransmits)
return false;
- start_ts = tcp_sk(sk)->retrans_stamp;
- if (unlikely(!start_ts)) {
- struct sk_buff *head = tcp_rtx_queue_head(sk);
-
- if (!head)
- return false;
- start_ts = tcp_skb_timestamp(head);
- }
+ start_ts = tcp_retransmit_stamp(sk);
+ if (!start_ts)
+ return false;
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk,
else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ timeout = jiffies_to_msecs(timeout);
}
- return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout);
+ return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
}
/* A write timeout has occurred. Process the after effects. */
@@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk)
if (!start_ts)
skb->skb_mstamp = tp->tcp_mstamp;
else if (icsk->icsk_user_timeout &&
- (s32)(tcp_time_stamp(tp) - start_ts) >
- jiffies_to_msecs(icsk->icsk_user_timeout))
+ (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
goto abort;
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
@@ -535,7 +559,8 @@ out_reset_timer:
/* Use normal (exponential) backoff */
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
__sk_dst_reset(sk);
@@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
* to determine when to timeout instead.
*/
if ((icsk->icsk_user_timeout != 0 &&
- elapsed >= icsk->icsk_user_timeout &&
+ elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {