Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 4
-rw-r--r--  net/ipv4/Makefile | 1
-rw-r--r--  net/ipv4/datagram.c | 1
-rw-r--r--  net/ipv4/devinet.c | 36
-rw-r--r--  net/ipv4/gre_offload.c | 2
-rw-r--r--  net/ipv4/icmp.c | 6
-rw-r--r--  net/ipv4/igmp.c | 2
-rw-r--r--  net/ipv4/inet_fragment.c | 318
-rw-r--r--  net/ipv4/ip_fragment.c | 87
-rw-r--r--  net/ipv4/ip_output.c | 13
-rw-r--r--  net/ipv4/ip_sockglue.c | 2
-rw-r--r--  net/ipv4/ip_tunnel.c | 31
-rw-r--r--  net/ipv4/ip_vti.c | 54
-rw-r--r--  net/ipv4/ipconfig.c | 5
-rw-r--r--  net/ipv4/ipmr.c | 2
-rw-r--r--  net/ipv4/netfilter/Kconfig | 29
-rw-r--r--  net/ipv4/netfilter/Makefile | 5
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 498
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_log_arp.c | 149
-rw-r--r--  net/ipv4/netfilter/nf_log_ipv4.c | 385
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 2
-rw-r--r--  net/ipv4/ping.c | 2
-rw-r--r--  net/ipv4/proc.c | 5
-rw-r--r--  net/ipv4/raw.c | 9
-rw-r--r--  net/ipv4/syncookies.c | 3
-rw-r--r--  net/ipv4/tcp.c | 22
-rw-r--r--  net/ipv4/tcp_input.c | 200
-rw-r--r--  net/ipv4/tcp_ipv4.c | 201
-rw-r--r--  net/ipv4/tcp_metrics.c | 1
-rw-r--r--  net/ipv4/tcp_minisocks.c | 2
-rw-r--r--  net/ipv4/tcp_offload.c | 18
-rw-r--r--  net/ipv4/tcp_output.c | 18
-rw-r--r--  net/ipv4/tcp_timer.c | 4
-rw-r--r--  net/ipv4/tcp_vegas.c | 3
-rw-r--r--  net/ipv4/tcp_veno.c | 2
-rw-r--r--  net/ipv4/udp.c | 156
-rw-r--r--  net/ipv4/udp_offload.c | 76
-rw-r--r--  net/ipv4/udp_tunnel.c | 100
-rw-r--r--  net/ipv4/xfrm4_protocol.c | 2
44 files changed, 1416 insertions(+), 1064 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05c57f0fcabe..dbc10d84161f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -307,6 +307,10 @@ config NET_IPVTI
the notion of a secure tunnel for IPSEC and then use routing protocol
on top.
+config NET_UDP_TUNNEL
+ tristate
+ default n
+
config INET_AH
tristate "IP: AH transformation"
select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f032688d20d3..8ee1cd4053ee 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_NET_IPIP) += ipip.o
gre-y := gre_demux.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index a3095fdefbed..90c0e8386116 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
+ inet_set_txhash(sk);
inet->inet_id = jiffies;
sk_dst_set(sk, &rt->dst);
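
[Editor's note] The new inet_set_txhash() call seeds the socket's transmit hash as soon as connect() fixes the 4-tuple. A hedged sketch of what that helper does at this point in the tree (flow_keys layout assumed from the flow dissector of this era); not the authoritative implementation:

#include <net/flow_keys.h>
#include <net/inet_sock.h>

/* Sketch: fold the now-known 4-tuple into sk->sk_txhash so every
 * packet of this flow selects a consistent transmit queue/path.
 */
static inline void sketch_inet_set_txhash(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct flow_keys keys;

	keys.src = inet->inet_saddr;
	keys.dst = inet->inet_daddr;
	keys.port16[0] = inet->inet_sport;
	keys.port16[1] = inet->inet_dport;

	sk->sk_txhash = flow_hash_from_keys(&keys);
}
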
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e9449376b58e..214882e7d6de 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -180,11 +180,12 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
-static void devinet_sysctl_register(struct in_device *idev);
+static int devinet_sysctl_register(struct in_device *idev);
static void devinet_sysctl_unregister(struct in_device *idev);
#else
-static void devinet_sysctl_register(struct in_device *idev)
+static int devinet_sysctl_register(struct in_device *idev)
{
+ return 0;
}
static void devinet_sysctl_unregister(struct in_device *idev)
{
@@ -232,6 +233,7 @@ EXPORT_SYMBOL(in_dev_finish_destroy);
static struct in_device *inetdev_init(struct net_device *dev)
{
struct in_device *in_dev;
+ int err = -ENOMEM;
ASSERT_RTNL();
@@ -252,7 +254,13 @@ static struct in_device *inetdev_init(struct net_device *dev)
/* Account for reference dev->ip_ptr (below) */
in_dev_hold(in_dev);
- devinet_sysctl_register(in_dev);
+ err = devinet_sysctl_register(in_dev);
+ if (err) {
+ in_dev->dead = 1;
+ in_dev_put(in_dev);
+ in_dev = NULL;
+ goto out;
+ }
ip_mc_init_dev(in_dev);
if (dev->flags & IFF_UP)
ip_mc_up(in_dev);
@@ -260,7 +268,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
/* we can receive as soon as ip_ptr is set -- do this last */
rcu_assign_pointer(dev->ip_ptr, in_dev);
out:
- return in_dev;
+ return in_dev ?: ERR_PTR(err);
out_kfree:
kfree(in_dev);
in_dev = NULL;
@@ -1347,8 +1355,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (!in_dev) {
if (event == NETDEV_REGISTER) {
in_dev = inetdev_init(dev);
- if (!in_dev)
- return notifier_from_errno(-ENOMEM);
+ if (IS_ERR(in_dev))
+ return notifier_from_errno(PTR_ERR(in_dev));
if (dev->flags & IFF_LOOPBACK) {
IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
@@ -2182,11 +2190,21 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
kfree(t);
}
-static void devinet_sysctl_register(struct in_device *idev)
+static int devinet_sysctl_register(struct in_device *idev)
{
- neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
- __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+ int err;
+
+ if (!sysctl_dev_name_is_allowed(idev->dev->name))
+ return -EINVAL;
+
+ err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
+ if (err)
+ return err;
+ err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
&idev->cnf);
+ if (err)
+ neigh_sysctl_unregister(idev->arp_parms);
+ return err;
}
static void devinet_sysctl_unregister(struct in_device *idev)
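
[Editor's note] inetdev_init() now reports failures through the returned pointer itself, and inetdev_event() unwraps them with PTR_ERR(). A minimal self-contained sketch of that ERR_PTR convention; struct foo and the helpers are hypothetical:

#include <linux/err.h>
#include <linux/notifier.h>
#include <linux/slab.h>

struct foo { int dummy; };

/* Errors travel inside the pointer: no out-parameter needed. */
static struct foo *foo_init(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return ERR_PTR(-ENOMEM);
	return f;
}

static int foo_event(void)
{
	struct foo *f = foo_init();

	if (IS_ERR(f))
		return notifier_from_errno(PTR_ERR(f));
	return NOTIFY_OK;
}
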
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index f0bdd47bbbcb..6556263c8fa5 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -74,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
/* segment inner packet. */
enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
segs = skb_mac_gso_segment(skb, enc_features);
- if (!segs || IS_ERR(segs)) {
+ if (IS_ERR_OR_NULL(segs)) {
skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
goto out;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 42b7bcf8045b..ea7d4afe8205 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -663,16 +663,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
/* Checkin full IP header plus 8 bytes of protocol to
* avoid additional coding at protocol handlers.
*/
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
+ ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
return;
+ }
raw_icmp_error(skb, protocol, info);
- rcu_read_lock();
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->err_handler)
ipprot->err_handler(skb, info);
- rcu_read_unlock();
}
static bool icmp_tag_validation(int proto)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index db710b059bab..f10eab462282 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1321,7 +1321,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
atomic_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
- setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
+ setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
im->unsolicit_count = IGMP_Unsolicited_Report_Count;
#endif
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 3b01959bf4bb..9eb89f3f0ee4 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,12 @@
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
+#define INETFRAGS_EVICT_BUCKETS 128
+#define INETFRAGS_EVICT_MAX 512
+
+/* don't rebuild inetfrag table with new secret more often than this */
+#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
+
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped.
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,24 +52,39 @@ const u8 ip_frag_ecn_table[16] = {
};
EXPORT_SYMBOL(ip_frag_ecn_table);
-static void inet_frag_secret_rebuild(unsigned long dummy)
+static unsigned int
+inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
+{
+ return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
+}
+
+static bool inet_frag_may_rebuild(struct inet_frags *f)
+{
+ return time_after(jiffies,
+ f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
+}
+
+static void inet_frag_secret_rebuild(struct inet_frags *f)
{
- struct inet_frags *f = (struct inet_frags *)dummy;
- unsigned long now = jiffies;
int i;
- /* Per bucket lock NOT needed here, due to write lock protection */
- write_lock(&f->lock);
+ write_seqlock_bh(&f->rnd_seqlock);
+
+ if (!inet_frag_may_rebuild(f))
+ goto out;
get_random_bytes(&f->rnd, sizeof(u32));
+
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
struct hlist_node *n;
hb = &f->hash[i];
+ spin_lock(&hb->chain_lock);
+
hlist_for_each_entry_safe(q, n, &hb->chain, list) {
- unsigned int hval = f->hashfn(q);
+ unsigned int hval = inet_frag_hashfn(f, q);
if (hval != i) {
struct inet_frag_bucket *hb_dest;
@@ -72,76 +93,200 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
/* Relink to new hash chain. */
hb_dest = &f->hash[hval];
+
+ /* This is the only place where we take
+ * another chain_lock while already holding
+ * one. As this will not run concurrently,
+ * we cannot deadlock on hb_dest lock below, if its
+ * already locked it will be released soon since
+ * other caller cannot be waiting for hb lock
+ * that we've taken above.
+ */
+ spin_lock_nested(&hb_dest->chain_lock,
+ SINGLE_DEPTH_NESTING);
hlist_add_head(&q->list, &hb_dest->chain);
+ spin_unlock(&hb_dest->chain_lock);
}
}
+ spin_unlock(&hb->chain_lock);
+ }
+
+ f->rebuild = false;
+ f->last_rebuild_jiffies = jiffies;
+out:
+ write_sequnlock_bh(&f->rnd_seqlock);
+}
+
+static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+{
+ return q->net->low_thresh == 0 ||
+ frag_mem_limit(q->net) >= q->net->low_thresh;
+}
+
+static unsigned int
+inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+{
+ struct inet_frag_queue *fq;
+ struct hlist_node *n;
+ unsigned int evicted = 0;
+ HLIST_HEAD(expired);
+
+evict_again:
+ spin_lock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+ if (!inet_fragq_should_evict(fq))
+ continue;
+
+ if (!del_timer(&fq->timer)) {
+ /* q expiring right now thus increment its refcount so
+ * it won't be freed under us and wait until the timer
+ * has finished executing then destroy it
+ */
+ atomic_inc(&fq->refcnt);
+ spin_unlock(&hb->chain_lock);
+ del_timer_sync(&fq->timer);
+ WARN_ON(atomic_read(&fq->refcnt) != 1);
+ inet_frag_put(fq, f);
+ goto evict_again;
+ }
+
+ fq->flags |= INET_FRAG_EVICTED;
+ hlist_del(&fq->list);
+ hlist_add_head(&fq->list, &expired);
+ ++evicted;
}
- write_unlock(&f->lock);
- mod_timer(&f->secret_timer, now + f->secret_interval);
+ spin_unlock(&hb->chain_lock);
+
+ hlist_for_each_entry_safe(fq, n, &expired, list)
+ f->frag_expire((unsigned long) fq);
+
+ return evicted;
}
-void inet_frags_init(struct inet_frags *f)
+static void inet_frag_worker(struct work_struct *work)
+{
+ unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+ unsigned int i, evicted = 0;
+ struct inet_frags *f;
+
+ f = container_of(work, struct inet_frags, frags_work);
+
+ BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+
+ local_bh_disable();
+
+ for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+ evicted += inet_evict_bucket(f, &f->hash[i]);
+ i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+ if (evicted > INETFRAGS_EVICT_MAX)
+ break;
+ }
+
+ f->next_bucket = i;
+
+ local_bh_enable();
+
+ if (f->rebuild && inet_frag_may_rebuild(f))
+ inet_frag_secret_rebuild(f);
+}
+
+static void inet_frag_schedule_worker(struct inet_frags *f)
+{
+ if (unlikely(!work_pending(&f->frags_work)))
+ schedule_work(&f->frags_work);
+}
+
+int inet_frags_init(struct inet_frags *f)
{
int i;
+ INIT_WORK(&f->frags_work, inet_frag_worker);
+
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb = &f->hash[i];
spin_lock_init(&hb->chain_lock);
INIT_HLIST_HEAD(&hb->chain);
}
- rwlock_init(&f->lock);
- setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
- (unsigned long)f);
- f->secret_timer.expires = jiffies + f->secret_interval;
- add_timer(&f->secret_timer);
+ seqlock_init(&f->rnd_seqlock);
+ f->last_rebuild_jiffies = 0;
+ f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+ NULL);
+ if (!f->frags_cachep)
+ return -ENOMEM;
+
+ return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_init_net(struct netns_frags *nf)
{
- nf->nqueues = 0;
init_frag_mem_limit(nf);
- INIT_LIST_HEAD(&nf->lru_list);
- spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);
void inet_frags_fini(struct inet_frags *f)
{
- del_timer(&f->secret_timer);
+ cancel_work_sync(&f->frags_work);
+ kmem_cache_destroy(f->frags_cachep);
}
EXPORT_SYMBOL(inet_frags_fini);
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
- nf->low_thresh = 0;
+ unsigned int seq;
+ int i;
+ nf->low_thresh = 0;
local_bh_disable();
- inet_frag_evictor(nf, f, true);
+
+evict_again:
+ seq = read_seqbegin(&f->rnd_seqlock);
+
+ for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+ inet_evict_bucket(f, &f->hash[i]);
+
+ if (read_seqretry(&f->rnd_seqlock, seq))
+ goto evict_again;
+
local_bh_enable();
percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+static struct inet_frag_bucket *
+get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
+__acquires(hb->chain_lock)
{
struct inet_frag_bucket *hb;
- unsigned int hash;
+ unsigned int seq, hash;
+
+ restart:
+ seq = read_seqbegin(&f->rnd_seqlock);
- read_lock(&f->lock);
- hash = f->hashfn(fq);
+ hash = inet_frag_hashfn(f, fq);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
+ if (read_seqretry(&f->rnd_seqlock, seq)) {
+ spin_unlock(&hb->chain_lock);
+ goto restart;
+ }
+
+ return hb;
+}
+
+static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+ struct inet_frag_bucket *hb;
+
+ hb = get_frag_bucket_locked(fq, f);
hlist_del(&fq->list);
spin_unlock(&hb->chain_lock);
-
- read_unlock(&f->lock);
- inet_frag_lru_del(fq);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -149,30 +294,29 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
if (del_timer(&fq->timer))
atomic_dec(&fq->refcnt);
- if (!(fq->last_in & INET_FRAG_COMPLETE)) {
+ if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
atomic_dec(&fq->refcnt);
- fq->last_in |= INET_FRAG_COMPLETE;
+ fq->flags |= INET_FRAG_COMPLETE;
}
}
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
- struct sk_buff *skb)
+ struct sk_buff *skb)
{
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
}
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
- int *work)
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
struct sk_buff *fp;
struct netns_frags *nf;
unsigned int sum, sum_truesize = 0;
- WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
+ WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
@@ -186,87 +330,32 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
fp = xp;
}
sum = sum_truesize + f->qsize;
- if (work)
- *work -= sum;
sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
- kfree(q);
-
+ kmem_cache_free(f->frags_cachep, q);
}
EXPORT_SYMBOL(inet_frag_destroy);
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
-{
- struct inet_frag_queue *q;
- int work, evicted = 0;
-
- if (!force) {
- if (frag_mem_limit(nf) <= nf->high_thresh)
- return 0;
- }
-
- work = frag_mem_limit(nf) - nf->low_thresh;
- while (work > 0 || force) {
- spin_lock(&nf->lru_lock);
-
- if (list_empty(&nf->lru_list)) {
- spin_unlock(&nf->lru_lock);
- break;
- }
-
- q = list_first_entry(&nf->lru_list,
- struct inet_frag_queue, lru_list);
- atomic_inc(&q->refcnt);
- /* Remove q from list to avoid several CPUs grabbing it */
- list_del_init(&q->lru_list);
-
- spin_unlock(&nf->lru_lock);
-
- spin_lock(&q->lock);
- if (!(q->last_in & INET_FRAG_COMPLETE))
- inet_frag_kill(q, f);
- spin_unlock(&q->lock);
-
- if (atomic_dec_and_test(&q->refcnt))
- inet_frag_destroy(q, f, &work);
- evicted++;
- }
-
- return evicted;
-}
-EXPORT_SYMBOL(inet_frag_evictor);
-
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
- struct inet_frag_queue *qp_in, struct inet_frags *f,
- void *arg)
+ struct inet_frag_queue *qp_in,
+ struct inet_frags *f,
+ void *arg)
{
- struct inet_frag_bucket *hb;
+ struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
struct inet_frag_queue *qp;
- unsigned int hash;
-
- read_lock(&f->lock); /* Protects against hash rebuild */
- /*
- * While we stayed w/o the lock other CPU could update
- * the rnd seed, so we need to re-calculate the hash
- * chain. Fortunatelly the qp_in can be used to get one.
- */
- hash = f->hashfn(qp_in);
- hb = &f->hash[hash];
- spin_lock(&hb->chain_lock);
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
- * such entry could be created on other cpu, while we
- * released the hash bucket lock.
+ * such entry could have been created on other cpu before
+ * we acquired hash bucket lock.
*/
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
- qp_in->last_in |= INET_FRAG_COMPLETE;
+ qp_in->flags |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
@@ -278,19 +367,24 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
- inet_frag_lru_add(nf, qp);
+
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
return qp;
}
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
- struct inet_frags *f, void *arg)
+ struct inet_frags *f,
+ void *arg)
{
struct inet_frag_queue *q;
- q = kzalloc(f->qsize, GFP_ATOMIC);
+ if (frag_mem_limit(nf) > nf->high_thresh) {
+ inet_frag_schedule_worker(f);
+ return NULL;
+ }
+
+ q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (q == NULL)
return NULL;
@@ -301,13 +395,13 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
- INIT_LIST_HEAD(&q->lru_list);
return q;
}
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
- struct inet_frags *f, void *arg)
+ struct inet_frags *f,
+ void *arg)
{
struct inet_frag_queue *q;
@@ -319,13 +413,17 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
}
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
- struct inet_frags *f, void *key, unsigned int hash)
- __releases(&f->lock)
+ struct inet_frags *f, void *key,
+ unsigned int hash)
{
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
int depth = 0;
+ if (frag_mem_limit(nf) > nf->low_thresh)
+ inet_frag_schedule_worker(f);
+
+ hash &= (INETFRAGS_HASHSZ - 1);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
@@ -333,18 +431,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
return q;
}
depth++;
}
spin_unlock(&hb->chain_lock);
- read_unlock(&f->lock);
if (depth <= INETFRAGS_MAXDEPTH)
return inet_frag_create(nf, f, key);
- else
- return ERR_PTR(-ENOBUFS);
+
+ if (inet_frag_may_rebuild(f)) {
+ if (!f->rebuild)
+ f->rebuild = true;
+ inet_frag_schedule_worker(f);
+ }
+
+ return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
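
[Editor's note] The rework above drops the global rwlock and guards the hash secret with a seqlock: a reader hashes, takes the per-bucket spinlock, then confirms no secret rebuild raced with it, exactly as get_frag_bucket_locked() does. A self-contained sketch of that retry pattern with hypothetical names:

#include <linux/seqlock.h>
#include <linux/spinlock.h>

#define HASHSZ 64 /* illustrative; must be a power of two */

struct bucket {
	spinlock_t lock;
};

/* hashfn is assumed to fold the current secret into its result, so a
 * concurrent rebuild changes which bucket a key maps to.
 */
static struct bucket *lock_bucket(seqlock_t *rnd_seq,
				  struct bucket tbl[HASHSZ],
				  unsigned int (*hashfn)(const void *key),
				  const void *key)
{
	struct bucket *b;
	unsigned int seq;

restart:
	seq = read_seqbegin(rnd_seq);
	b = &tbl[hashfn(key) & (HASHSZ - 1)];
	spin_lock(&b->lock);
	if (read_seqretry(rnd_seq, seq)) {
		/* A secret rebuild ran meanwhile; the bucket choice may
		 * be stale, so drop the lock and recompute.
		 */
		spin_unlock(&b->lock);
		goto restart;
	}
	return b;
}
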
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ed32313e307c..15f0e2bad7ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -55,6 +55,7 @@
*/
static int sysctl_ipfrag_max_dist __read_mostly = 64;
+static const char ip_frag_cache_name[] = "ip4-frags";
struct ipfrag_skb_cb
{
@@ -86,11 +87,6 @@ static inline u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
-int ip_frag_nqueues(struct net *net)
-{
- return net->ipv4.frags.nqueues;
-}
-
int ip_frag_mem(struct net *net)
{
return sum_frag_mem_limit(&net->ipv4.frags);
@@ -109,21 +105,21 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
return jhash_3words((__force u32)id << 16 | prot,
(__force u32)saddr, (__force u32)daddr,
- ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
+ ip4_frags.rnd);
}
-static unsigned int ip4_hashfn(struct inet_frag_queue *q)
+static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
{
- struct ipq *ipq;
+ const struct ipq *ipq;
ipq = container_of(q, struct ipq, q);
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
}
-static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
+static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
{
- struct ipq *qp;
- struct ip4_create_arg *arg = a;
+ const struct ipq *qp;
+ const struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
return qp->id == arg->iph->id &&
@@ -133,14 +129,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
qp->user == arg->user;
}
-static void ip4_frag_init(struct inet_frag_queue *q, void *a)
+static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
frags);
struct net *net = container_of(ipv4, struct net, ipv4);
- struct ip4_create_arg *arg = a;
+ const struct ip4_create_arg *arg = a;
qp->protocol = arg->iph->protocol;
qp->id = arg->iph->id;
@@ -177,18 +173,6 @@ static void ipq_kill(struct ipq *ipq)
inet_frag_kill(&ipq->q, &ip4_frags);
}
-/* Memory limiting on fragments. Evictor trashes the oldest
- * fragment queue until we are back under the threshold.
- */
-static void ip_evictor(struct net *net)
-{
- int evicted;
-
- evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
- if (evicted)
- IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
-}
-
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
@@ -202,19 +186,22 @@ static void ip_expire(unsigned long arg)
spin_lock(&qp->q.lock);
- if (qp->q.last_in & INET_FRAG_COMPLETE)
+ if (qp->q.flags & INET_FRAG_COMPLETE)
goto out;
ipq_kill(qp);
-
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
- if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
+ if (!(qp->q.flags & INET_FRAG_EVICTED)) {
struct sk_buff *head = qp->q.fragments;
const struct iphdr *iph;
int err;
+ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+ if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+ goto out;
+
rcu_read_lock();
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
@@ -227,8 +214,7 @@ static void ip_expire(unsigned long arg)
if (err)
goto out_rcu_unlock;
- /*
- * Only an end host needs to send an ICMP
+ /* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
if (qp->user == IP_DEFRAG_AF_PACKET ||
@@ -237,7 +223,6 @@ static void ip_expire(unsigned long arg)
(skb_rtable(head)->rt_type != RTN_LOCAL)))
goto out_rcu_unlock;
-
/* Send an ICMP "Fragment Reassembly Timeout" message. */
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
@@ -260,7 +245,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
arg.iph = iph;
arg.user = user;
- read_lock(&ip4_frags.lock);
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
@@ -319,7 +303,7 @@ static int ip_frag_reinit(struct ipq *qp)
} while (fp);
sub_frag_mem_limit(&qp->q, sum_truesize);
- qp->q.last_in = 0;
+ qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
@@ -340,7 +324,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
int err = -ENOENT;
u8 ecn;
- if (qp->q.last_in & INET_FRAG_COMPLETE)
+ if (qp->q.flags & INET_FRAG_COMPLETE)
goto err;
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
@@ -367,9 +351,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
* or have different end, the segment is corrupted.
*/
if (end < qp->q.len ||
- ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
+ ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
goto err;
- qp->q.last_in |= INET_FRAG_LAST_IN;
+ qp->q.flags |= INET_FRAG_LAST_IN;
qp->q.len = end;
} else {
if (end&7) {
@@ -379,7 +363,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
}
if (end > qp->q.len) {
/* Some bits beyond end -> corruption. */
- if (qp->q.last_in & INET_FRAG_LAST_IN)
+ if (qp->q.flags & INET_FRAG_LAST_IN)
goto err;
qp->q.len = end;
}
@@ -488,13 +472,13 @@ found:
qp->ecn |= ecn;
add_frag_mem_limit(&qp->q, skb->truesize);
if (offset == 0)
- qp->q.last_in |= INET_FRAG_FIRST_IN;
+ qp->q.flags |= INET_FRAG_FIRST_IN;
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
skb->len + ihl > qp->q.max_size)
qp->q.max_size = skb->len + ihl;
- if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
unsigned long orefdst = skb->_skb_refdst;
@@ -505,7 +489,6 @@ found:
}
skb_dst_drop(skb);
- inet_frag_lru_move(&qp->q);
return -EINPROGRESS;
err:
@@ -655,9 +638,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
- /* Start by cleaning up the memory. */
- ip_evictor(net);
-
/* Lookup (or create) queue header */
if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
int ret;
@@ -721,14 +701,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
.data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.ipv4.frags.low_thresh
},
{
.procname = "ipfrag_low_thresh",
.data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &init_net.ipv4.frags.high_thresh
},
{
.procname = "ipfrag_time",
@@ -740,10 +723,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
{ }
};
+/* secret interval has been deprecated */
+static int ip4_frags_secret_interval_unused;
static struct ctl_table ip4_frags_ctl_table[] = {
{
.procname = "ipfrag_secret_interval",
- .data = &ip4_frags.secret_interval,
+ .data = &ip4_frags_secret_interval_unused,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
@@ -771,7 +756,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
goto err_alloc;
table[0].data = &net->ipv4.frags.high_thresh;
+ table[0].extra1 = &net->ipv4.frags.low_thresh;
+ table[0].extra2 = &init_net.ipv4.frags.high_thresh;
table[1].data = &net->ipv4.frags.low_thresh;
+ table[1].extra2 = &net->ipv4.frags.high_thresh;
table[2].data = &net->ipv4.frags.timeout;
/* Don't export sysctls to unprivileged users */
@@ -873,6 +861,7 @@ void __init ipfrag_init(void)
ip4_frags.qsize = sizeof(struct ipq);
ip4_frags.match = ip4_frag_match;
ip4_frags.frag_expire = ip_expire;
- ip4_frags.secret_interval = 10 * 60 * HZ;
- inet_frags_init(&ip4_frags);
+ ip4_frags.frags_cache_name = ip_frag_cache_name;
+ if (inet_frags_init(&ip4_frags))
+ panic("IP: failed to allocate ip4_frags cache\n");
}
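
[Editor's note] The ipfrag_high_thresh/ipfrag_low_thresh handlers above switch to proc_dointvec_minmax so the two values can never cross: each threshold's extra1/extra2 bound points at the other. The shape of such a cross-clamped pair, with illustrative names and defaults:

#include <linux/sysctl.h>

static int frag_low_thresh  = 3 * 1024 * 1024;
static int frag_high_thresh = 4 * 1024 * 1024;
static int zero;

static struct ctl_table frag_table[] = {
	{
		.procname	= "high_thresh",
		.data		= &frag_high_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &frag_low_thresh,	/* floor: high >= low */
	},
	{
		.procname	= "low_thresh",
		.data		= &frag_low_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &frag_high_thresh,	/* ceiling: low <= high */
	},
	{ }
};
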
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d3b6b0e9857..215af2b155cb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk,
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
+ u32 tskey = 0;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->fragsize;
+ if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = sk->sk_tskey++;
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -962,10 +966,6 @@ alloc_new_skb:
sk->sk_allocation);
if (unlikely(skb == NULL))
err = -ENOBUFS;
- else
- /* only the initial fragment is
- time stamped */
- cork->tx_flags = 0;
}
if (skb == NULL)
goto error;
@@ -976,7 +976,12 @@ alloc_new_skb:
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
+
+ /* only the initial fragment is time stamped */
skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
/*
* Find where to start putting bytes.
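
[Editor's note] The tskey bookkeeping added above backs SOF_TIMESTAMPING_OPT_ID for datagram sockets: each send is stamped with an incrementing key that is echoed back alongside its timestamp. A hedged userspace sketch of turning that on:

#include <linux/net_tstamp.h>
#include <sys/socket.h>

/* Sketch: request software TX timestamps and ask the kernel to tag
 * each one with the send counter (OPT_ID), so messages read from the
 * error queue can be matched to individual sendmsg() calls.
 */
static int enable_tx_tstamp_id(int fd)
{
	int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
		    SOF_TIMESTAMPING_SOFTWARE |
		    SOF_TIMESTAMPING_OPT_ID;

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
			  &flags, sizeof(flags));
}
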
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64741b938632..5cb830c78990 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1319,7 +1319,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control = optval;
+ msg.msg_control = (__force void *) optval;
msg.msg_controllen = len;
msg.msg_flags = flags;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6f9de61dce5f..afed1aac2638 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -69,23 +69,25 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
}
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
- struct dst_entry *dst)
+ struct dst_entry *dst, __be32 saddr)
{
struct dst_entry *old_dst;
dst_clone(dst);
old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
dst_release(old_dst);
+ idst->saddr = saddr;
}
-static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+static void tunnel_dst_set(struct ip_tunnel *t,
+ struct dst_entry *dst, __be32 saddr)
{
- __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
+ __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst, saddr);
}
static void tunnel_dst_reset(struct ip_tunnel *t)
{
- tunnel_dst_set(t, NULL);
+ tunnel_dst_set(t, NULL, 0);
}
void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
@@ -93,20 +95,25 @@ void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
int i;
for_each_possible_cpu(i)
- __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
+ __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
-static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
+ u32 cookie, __be32 *saddr)
{
+ struct ip_tunnel_dst *idst;
struct dst_entry *dst;
rcu_read_lock();
- dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
+ idst = this_cpu_ptr(t->dst_cache);
+ dst = rcu_dereference(idst->dst);
if (dst && !atomic_inc_not_zero(&dst->__refcnt))
dst = NULL;
if (dst) {
- if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (!dst->obsolete || dst->ops->check(dst, cookie)) {
+ *saddr = idst->saddr;
+ } else {
tunnel_dst_reset(t);
dst_release(dst);
dst = NULL;
@@ -305,7 +312,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
}
ASSERT_RTNL();
- dev = alloc_netdev(ops->priv_size, name, ops->setup);
+ dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
if (!dev) {
err = -ENOMEM;
goto failed;
@@ -367,7 +374,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
- tunnel_dst_set(tunnel, &rt->dst);
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
ip_rt_put(rt);
}
if (dev->type != ARPHRD_ETHER)
@@ -610,7 +617,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
- rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
+ rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -620,7 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
if (connected)
- tunnel_dst_set(tunnel, &rt->dst);
+ tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
}
if (rt->dst.dev == dev) {
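
[Editor's note] The tunnel hunks thread a source address through the per-cpu dst cache so a cached route is never paired with a stale saddr. The cache entry they operate on, as assumed from context:

/* Assumed shape of the per-cpu cache entry updated above: the dst and
 * the saddr chosen when that dst was looked up live side by side and
 * are always written together (see __tunnel_dst_set()).
 */
struct ip_tunnel_dst_sketch {
	struct dst_entry __rcu	*dst;
	__be32			saddr;
};
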
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index b8960f3527f3..e453cb724a95 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -534,40 +534,28 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
static int __init vti_init(void)
{
+ const char *msg;
int err;
- pr_info("IPv4 over IPSec tunneling driver\n");
+ pr_info("IPv4 over IPsec tunneling driver\n");
+ msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
- return err;
- err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
- if (err < 0) {
- unregister_pernet_device(&vti_net_ops);
- pr_info("vti init: can't register tunnel\n");
-
- return err;
- }
+ goto pernet_dev_failed;
+ msg = "tunnel protocols";
+ err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
- if (err < 0) {
- xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
- unregister_pernet_device(&vti_net_ops);
- pr_info("vti init: can't register tunnel\n");
-
- return err;
- }
-
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
- if (err < 0) {
- xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
- xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
- unregister_pernet_device(&vti_net_ops);
- pr_info("vti init: can't register tunnel\n");
-
- return err;
- }
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+ msg = "netlink interface";
err = rtnl_link_register(&vti_link_ops);
if (err < 0)
goto rtnl_link_failed;
@@ -576,23 +564,23 @@ static int __init vti_init(void)
rtnl_link_failed:
xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
+pernet_dev_failed:
+ pr_err("vti init: failed to register %s\n", msg);
return err;
}
static void __exit vti_fini(void)
{
rtnl_link_unregister(&vti_link_ops);
- if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP))
- pr_info("vti close: can't deregister tunnel\n");
- if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))
- pr_info("vti close: can't deregister tunnel\n");
- if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
- pr_info("vti close: can't deregister tunnel\n");
-
-
+ xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
}
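
[Editor's note] vti_init() is reshaped above into the kernel's usual goto-unwind ladder: one label per registration, unwound in reverse order, with a single error report keyed by the msg string. The generic shape, with placeholder helpers:

/* register_a/b/c and the unregister counterparts are placeholders. */
int register_a(void);
void unregister_a(void);
int register_b(void);
void unregister_b(void);
int register_c(void);

static int __init demo_init(void)
{
	int err;

	err = register_a();
	if (err)
		goto err_a;
	err = register_b();
	if (err)
		goto err_b;
	err = register_c();
	if (err)
		goto err_c;
	return 0;

err_c:
	unregister_b();	/* undo in reverse order of setup */
err_b:
	unregister_a();
err_a:
	return err;
}
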
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b3e86ea7b71b..5bbef4fdcb43 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */
__be32 root_server_addr = NONE; /* Address of NFS server */
u8 root_server_path[256] = { 0, }; /* Path to mount as root */
-__be32 ic_dev_xid; /* Device under configuration */
-
/* vendor class identifier */
static char vendor_class_identifier[253] __initdata;
@@ -654,6 +652,7 @@ static struct packet_type bootp_packet_type __initdata = {
.func = ic_bootp_recv,
};
+static __be32 ic_dev_xid; /* Device under configuration */
/*
* Initialize DHCP/BOOTP extension fields in the request.
@@ -1218,10 +1217,10 @@ static int __init ic_dynamic(void)
get_random_bytes(&timeout, sizeof(timeout));
timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
for (;;) {
+#ifdef IPCONFIG_BOOTP
/* Track the device we are configuring */
ic_dev_xid = d->xid;
-#ifdef IPCONFIG_BOOTP
if (do_bootp && (d->able & IC_BOOTP))
ic_bootp_send_if(d, jiffies - start_jiffies);
#endif
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 65bcaa789043..c8034587859d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
else
sprintf(name, "pimreg%u", mrt->id);
- dev = alloc_netdev(0, name, reg_vif_setup);
+ dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
if (dev == NULL)
return NULL;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a26ce035e3fa..fb173126f03d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
If unsure, say Y.
+config NF_LOG_ARP
+ tristate "ARP packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
+config NF_LOG_IPV4
+ tristate "IPv4 packet logging"
+ default m if NETFILTER_ADVANCED=n
+ select NF_LOG_COMMON
+
config NF_TABLES_IPV4
depends on NF_TABLES
tristate "IPv4 nf_tables support"
@@ -159,25 +169,6 @@ config IP_NF_TARGET_SYNPROXY
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_ULOG
- tristate "ULOG target support (obsolete)"
- default m if NETFILTER_ADVANCED=n
- ---help---
-
- This option enables the old IPv4-only "ipt_ULOG" implementation
- which has been obsoleted by the new "nfnetlink_log" code (see
- CONFIG_NETFILTER_NETLINK_LOG).
-
- This option adds a `ULOG' target, which allows you to create rules in
- any iptables table. The packet is passed to a userspace logging
- daemon using netlink multicast sockets; unlike the LOG target
- which can only be viewed through syslog.
-
- The appropriate userspace logging daemon (ulogd) may be obtained from
- <http://www.netfilter.org/projects/ulogd/index.html>
-
- To compile it as a module, choose M here. If unsure, say N.
-
# NAT + specific targets: nf_conntrack
config NF_NAT_IPV4
tristate "IPv4 NAT"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 90b82405331e..33001621465b 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -19,6 +19,10 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
+# logging
+obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
+obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
+
# NAT helpers (nf_conntrack)
obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
@@ -53,7 +57,6 @@ obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
-obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
# generic ARP tables
obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
deleted file mode 100644
index 9cb993cd224b..000000000000
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * netfilter module for userspace packet logging daemons
- *
- * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2005-2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This module accepts two parameters:
- *
- * nlbufsiz:
- * The parameter specifies how big the buffer for each netlink multicast
- * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
- * get accumulated in the kernel until they are sent to userspace. It is
- * NOT possible to allocate more than 128kB, and it is strongly discouraged,
- * because atomically allocating 128kB inside the network rx softirq is not
- * reliable. Please also keep in mind that this buffer size is allocated for
- * each nlgroup you are using, so the total kernel memory usage increases
- * by that factor.
- *
- * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since
- * nlbufsiz is used with alloc_skb, which adds another
- * sizeof(struct skb_shared_info). Use NLMSG_GOODSIZE instead.
- *
- * flushtimeout:
- * Specify, after how many hundredths of a second the queue should be
- * flushed even if it is not full yet.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/socket.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <net/netlink.h>
-#include <linux/netdevice.h>
-#include <linux/mm.h>
-#include <linux/moduleparam.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_ULOG.h>
-#include <net/netfilter/nf_log.h>
-#include <net/netns/generic.h>
-#include <net/sock.h>
-#include <linux/bitops.h>
-#include <asm/unaligned.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
-
-#define ULOG_NL_EVENT 111 /* Harald's favorite number */
-#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
-
-static unsigned int nlbufsiz = NLMSG_GOODSIZE;
-module_param(nlbufsiz, uint, 0400);
-MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
-
-static unsigned int flushtimeout = 10;
-module_param(flushtimeout, uint, 0600);
-MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
-
-static bool nflog = true;
-module_param(nflog, bool, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
-/* global data structures */
-
-typedef struct {
- unsigned int qlen; /* number of nlmsgs' in the skb */
- struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */
- struct sk_buff *skb; /* the pre-allocated skb */
- struct timer_list timer; /* the timer function */
-} ulog_buff_t;
-
-static int ulog_net_id __read_mostly;
-struct ulog_net {
- unsigned int nlgroup[ULOG_MAXNLGROUPS];
- ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
- struct sock *nflognl;
- spinlock_t lock;
-};
-
-static struct ulog_net *ulog_pernet(struct net *net)
-{
- return net_generic(net, ulog_net_id);
-}
-
-/* send one ulog_buff_t to userspace */
-static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
-{
- ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
-
- pr_debug("ulog_send: timer is deleting\n");
- del_timer(&ub->timer);
-
- if (!ub->skb) {
- pr_debug("ulog_send: nothing to send\n");
- return;
- }
-
- /* last nlmsg needs NLMSG_DONE */
- if (ub->qlen > 1)
- ub->lastnlh->nlmsg_type = NLMSG_DONE;
-
- NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
- pr_debug("throwing %d packets to netlink group %u\n",
- ub->qlen, nlgroupnum + 1);
- netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
- GFP_ATOMIC);
-
- ub->qlen = 0;
- ub->skb = NULL;
- ub->lastnlh = NULL;
-}
-
-
-/* timer function to flush queue in flushtimeout time */
-static void ulog_timer(unsigned long data)
-{
- unsigned int groupnum = *((unsigned int *)data);
- struct ulog_net *ulog = container_of((void *)data,
- struct ulog_net,
- nlgroup[groupnum]);
- pr_debug("timer function called, calling ulog_send\n");
-
- /* lock to protect against somebody modifying our structure
- * from ipt_ulog_target at the same time */
- spin_lock_bh(&ulog->lock);
- ulog_send(ulog, groupnum);
- spin_unlock_bh(&ulog->lock);
-}
-
-static struct sk_buff *ulog_alloc_skb(unsigned int size)
-{
- struct sk_buff *skb;
- unsigned int n;
-
- /* alloc skb which should be big enough for a whole
- * multipart message. WARNING: has to be <= 131000
- * due to slab allocator restrictions */
-
- n = max(size, nlbufsiz);
- skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
- if (!skb) {
- if (n > size) {
- /* try to allocate only as much as we need for
- * current packet */
-
- skb = alloc_skb(size, GFP_ATOMIC);
- if (!skb)
- pr_debug("cannot even allocate %ub\n", size);
- }
- }
-
- return skb;
-}
-
-static void ipt_ulog_packet(struct net *net,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct ipt_ulog_info *loginfo,
- const char *prefix)
-{
- ulog_buff_t *ub;
- ulog_packet_msg_t *pm;
- size_t size, copy_len;
- struct nlmsghdr *nlh;
- struct timeval tv;
- struct ulog_net *ulog = ulog_pernet(net);
-
- /* ffs == find first bit set, necessary because userspace
- * is already shifting groupnumber, but we need unshifted.
- * ffs() returns [1..32], we need [0..31] */
- unsigned int groupnum = ffs(loginfo->nl_group) - 1;
-
- /* calculate the size of the skb needed */
- if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len)
- copy_len = skb->len;
- else
- copy_len = loginfo->copy_range;
-
- size = nlmsg_total_size(sizeof(*pm) + copy_len);
-
- ub = &ulog->ulog_buffers[groupnum];
-
- spin_lock_bh(&ulog->lock);
-
- if (!ub->skb) {
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- } else if (ub->qlen >= loginfo->qthreshold ||
- size > skb_tailroom(ub->skb)) {
- /* either the queue len is too high or we don't have
- * enough room in nlskb left. send it to userspace. */
-
- ulog_send(ulog, groupnum);
-
- if (!(ub->skb = ulog_alloc_skb(size)))
- goto alloc_failure;
- }
-
- pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
-
- nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
- sizeof(*pm)+copy_len, 0);
- if (!nlh) {
- pr_debug("error during nlmsg_put\n");
- goto out_unlock;
- }
- ub->qlen++;
-
- pm = nlmsg_data(nlh);
- memset(pm, 0, sizeof(*pm));
-
- /* We might not have a timestamp, get one */
- if (skb->tstamp.tv64 == 0)
- __net_timestamp((struct sk_buff *)skb);
-
- /* copy hook, prefix, timestamp, payload, etc. */
- pm->data_len = copy_len;
- tv = ktime_to_timeval(skb->tstamp);
- put_unaligned(tv.tv_sec, &pm->timestamp_sec);
- put_unaligned(tv.tv_usec, &pm->timestamp_usec);
- put_unaligned(skb->mark, &pm->mark);
- pm->hook = hooknum;
- if (prefix != NULL) {
- strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
- pm->prefix[sizeof(pm->prefix) - 1] = '\0';
- }
- else if (loginfo->prefix[0] != '\0')
- strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
-
- if (in && in->hard_header_len > 0 &&
- skb->mac_header != skb->network_header &&
- in->hard_header_len <= ULOG_MAC_LEN) {
- memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
- pm->mac_len = in->hard_header_len;
- } else
- pm->mac_len = 0;
-
- if (in)
- strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
-
- if (out)
- strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
-
- /* copy_len <= skb->len, so can't fail. */
- if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
- BUG();
-
- /* check if we are building multi-part messages */
- if (ub->qlen > 1)
- ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
-
- ub->lastnlh = nlh;
-
- /* if timer isn't already running, start it */
- if (!timer_pending(&ub->timer)) {
- ub->timer.expires = jiffies + flushtimeout * HZ / 100;
- add_timer(&ub->timer);
- }
-
- /* if threshold is reached, send message to userspace */
- if (ub->qlen >= loginfo->qthreshold) {
- if (loginfo->qthreshold > 1)
- nlh->nlmsg_type = NLMSG_DONE;
- ulog_send(ulog, groupnum);
- }
-out_unlock:
- spin_unlock_bh(&ulog->lock);
-
- return;
-
-alloc_failure:
- pr_debug("Error building netlink message\n");
- spin_unlock_bh(&ulog->lock);
-}
-
-static unsigned int
-ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- struct net *net = dev_net(par->in ? par->in : par->out);
-
- ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
- par->targinfo, NULL);
- return XT_CONTINUE;
-}
-
-static void ipt_logfn(struct net *net,
- u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *li,
- const char *prefix)
-{
- struct ipt_ulog_info loginfo;
-
- if (!li || li->type != NF_LOG_TYPE_ULOG) {
- loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
- loginfo.copy_range = 0;
- loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
- loginfo.prefix[0] = '\0';
- } else {
- loginfo.nl_group = li->u.ulog.group;
- loginfo.copy_range = li->u.ulog.copy_len;
- loginfo.qthreshold = li->u.ulog.qthreshold;
- strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
- }
-
- ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
-}
-
-static int ulog_tg_check(const struct xt_tgchk_param *par)
-{
- const struct ipt_ulog_info *loginfo = par->targinfo;
-
- if (!par->net->xt.ulog_warn_deprecated) {
- pr_info("ULOG is deprecated and it will be removed soon, "
- "use NFLOG instead\n");
- par->net->xt.ulog_warn_deprecated = true;
- }
-
- if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
- pr_debug("prefix not null-terminated\n");
- return -EINVAL;
- }
- if (loginfo->qthreshold > ULOG_MAX_QLEN) {
- pr_debug("queue threshold %Zu > MAX_QLEN\n",
- loginfo->qthreshold);
- return -EINVAL;
- }
- return 0;
-}
-
-#ifdef CONFIG_COMPAT
-struct compat_ipt_ulog_info {
- compat_uint_t nl_group;
- compat_size_t copy_range;
- compat_size_t qthreshold;
- char prefix[ULOG_PREFIX_LEN];
-};
-
-static void ulog_tg_compat_from_user(void *dst, const void *src)
-{
- const struct compat_ipt_ulog_info *cl = src;
- struct ipt_ulog_info l = {
- .nl_group = cl->nl_group,
- .copy_range = cl->copy_range,
- .qthreshold = cl->qthreshold,
- };
-
- memcpy(l.prefix, cl->prefix, sizeof(l.prefix));
- memcpy(dst, &l, sizeof(l));
-}
-
-static int ulog_tg_compat_to_user(void __user *dst, const void *src)
-{
- const struct ipt_ulog_info *l = src;
- struct compat_ipt_ulog_info cl = {
- .nl_group = l->nl_group,
- .copy_range = l->copy_range,
- .qthreshold = l->qthreshold,
- };
-
- memcpy(cl.prefix, l->prefix, sizeof(cl.prefix));
- return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
-}
-#endif /* CONFIG_COMPAT */
-
-static struct xt_target ulog_tg_reg __read_mostly = {
- .name = "ULOG",
- .family = NFPROTO_IPV4,
- .target = ulog_tg,
- .targetsize = sizeof(struct ipt_ulog_info),
- .checkentry = ulog_tg_check,
-#ifdef CONFIG_COMPAT
- .compatsize = sizeof(struct compat_ipt_ulog_info),
- .compat_from_user = ulog_tg_compat_from_user,
- .compat_to_user = ulog_tg_compat_to_user,
-#endif
- .me = THIS_MODULE,
-};
-
-static struct nf_logger ipt_ulog_logger __read_mostly = {
- .name = "ipt_ULOG",
- .logfn = ipt_logfn,
- .me = THIS_MODULE,
-};
-
-static int __net_init ulog_tg_net_init(struct net *net)
-{
- int i;
- struct ulog_net *ulog = ulog_pernet(net);
- struct netlink_kernel_cfg cfg = {
- .groups = ULOG_MAXNLGROUPS,
- };
-
- spin_lock_init(&ulog->lock);
- /* initialize ulog_buffers */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ulog->nlgroup[i] = i;
- setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
- (unsigned long)&ulog->nlgroup[i]);
- }
-
- ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
- if (!ulog->nflognl)
- return -ENOMEM;
-
- if (nflog)
- nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
-
- return 0;
-}
-
-static void __net_exit ulog_tg_net_exit(struct net *net)
-{
- ulog_buff_t *ub;
- int i;
- struct ulog_net *ulog = ulog_pernet(net);
-
- if (nflog)
- nf_log_unset(net, &ipt_ulog_logger);
-
- netlink_kernel_release(ulog->nflognl);
-
- /* remove pending timers and free allocated skb's */
- for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
- ub = &ulog->ulog_buffers[i];
- pr_debug("timer is deleting\n");
- del_timer(&ub->timer);
-
- if (ub->skb) {
- kfree_skb(ub->skb);
- ub->skb = NULL;
- }
- }
-}
-
-static struct pernet_operations ulog_tg_net_ops = {
- .init = ulog_tg_net_init,
- .exit = ulog_tg_net_exit,
- .id = &ulog_net_id,
- .size = sizeof(struct ulog_net),
-};
-
-static int __init ulog_tg_init(void)
-{
- int ret;
- pr_debug("init module\n");
-
- if (nlbufsiz > 128*1024) {
- pr_warn("Netlink buffer has to be <= 128kB\n");
- return -EINVAL;
- }
-
- ret = register_pernet_subsys(&ulog_tg_net_ops);
- if (ret)
- goto out_pernet;
-
- ret = xt_register_target(&ulog_tg_reg);
- if (ret < 0)
- goto out_target;
-
- if (nflog)
- nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
-
- return 0;
-
-out_target:
- unregister_pernet_subsys(&ulog_tg_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit ulog_tg_exit(void)
-{
- pr_debug("cleanup_module\n");
- if (nflog)
- nf_log_unregister(&ipt_ulog_logger);
- xt_unregister_target(&ulog_tg_reg);
- unregister_pernet_subsys(&ulog_tg_net_ops);
-}
-
-module_init(ulog_tg_init);
-module_exit(ulog_tg_exit);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8127dc802865..a054fe083431 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -ENOENT;
}
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -358,7 +358,7 @@ static struct nf_sockopt_ops so_getorigdst = {
.pf = PF_INET,
.get_optmin = SO_ORIGINAL_DST,
.get_optmax = SO_ORIGINAL_DST+1,
- .get = &getorigdst,
+ .get = getorigdst,
.owner = THIS_MODULE,
};
@@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.invert_tuple = ipv4_invert_tuple,
.print_tuple = ipv4_print_tuple,
.get_l4proto = ipv4_get_l4proto,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = ipv4_tuple_to_nlattr,
.nlattr_tuple_size = ipv4_nlattr_tuple_size,
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index a338dad41b7d..b91b2641adda 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
}
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
@@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
.error = icmp_error,
.destroy = NULL,
.me = NULL,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmp_tuple_to_nlattr,
.nlattr_tuple_size = icmp_nlattr_tuple_size,
.nlattr_to_tuple = icmp_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index b8f6381c7d0b..76bd1aef257f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -17,7 +17,7 @@
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif
#include <net/netfilter/nf_conntrack_zones.h>
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
{
u16 zone = NF_CT_DEFAULT_ZONE;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (skb->nfct)
zone = nf_ct_zone((struct nf_conn *)skb->nfct);
#endif
@@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
inet->nodefrag)
return NF_ACCEPT;
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
/* Previously seen (loopback)? Ignore. Do this before
fragment check. */
if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
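
[Editor's note] Several conntrack files in this series replace the open-coded defined(CONFIG_X) || defined(CONFIG_X_MODULE) test with IS_ENABLED(). The two spellings are equivalent; IS_ENABLED() is true whether the option is built in or modular:

#include <linux/kconfig.h>

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
/* old spelling */
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* new spelling, same condition */
#endif
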
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
new file mode 100644
index 000000000000..ccfc78db12ee
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -0,0 +1,149 @@
+/*
+ * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Based on code from ebt_log from:
+ *
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 5,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+struct arppayload {
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+static void dump_arp_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int nhoff)
+{
+ const struct arphdr *ah;
+ struct arphdr _arph;
+ const struct arppayload *ap;
+ struct arppayload _arpp;
+
+ ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+ nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+
+ /* If it's for Ethernet and the lengths are OK, then log the ARP
+ * payload.
+ */
+ if (ah->ar_hrd != htons(1) ||
+ ah->ar_hln != ETH_ALEN ||
+ ah->ar_pln != sizeof(__be32))
+ return;
+
+ ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
+ if (ap == NULL) {
+ nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+ skb->len - sizeof(_arph));
+ return;
+ }
+ nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+ ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+void nf_log_arp_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix);
+ dump_arp_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_arp_logger __read_mostly = {
+ .name = "nf_log_arp",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_arp_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_arp_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_arp_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_arp_logger);
+}
+
+static struct pernet_operations nf_log_arp_net_ops = {
+ .init = nf_log_arp_net_init,
+ .exit = nf_log_arp_net_exit,
+};
+
+static int __init nf_log_arp_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_arp_net_ops);
+ if (ret < 0)
+ return ret;
+
+ nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ return 0;
+}
+
+static void __exit nf_log_arp_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_arp_net_ops);
+ nf_log_unregister(&nf_arp_logger);
+}
+
+module_init(nf_log_arp_init);
+module_exit(nf_log_arp_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter ARP packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(3, 0);
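
The dump helper above leans on kernel-specific printf extensions rather than standard C conversions; a minimal standalone illustration:

    /* %pM prints a 6-byte MAC address array, %pI4 a big-endian IPv4
     * address; both take a pointer to the raw bytes.
     */
    unsigned char mac[ETH_ALEN] = { 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff };
    __be32 addr = htonl(0xc0a80001);        /* 192.168.0.1 */

    printk(KERN_INFO "MACSRC=%pM IPSRC=%pI4\n", mac, &addr);
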
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
new file mode 100644
index 000000000000..078bdca1b607
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -0,0 +1,385 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = 5,
+ .logflags = NF_LOG_MASK,
+ },
+ },
+};
+
+/* One level of recursion won't kill us */
+static void dump_ipv4_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int iphoff)
+{
+ struct iphdr _iph;
+ const struct iphdr *ih;
+ unsigned int logflags;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_MASK;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (ih == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ nf_log_buf_add(m, "CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ nf_log_buf_add(m, "DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ nf_log_buf_add(m, "MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((logflags & XT_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
+ const unsigned char *op;
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+ optsize, _opt);
+ if (op == NULL) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+ nf_log_buf_add(m, ") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff+ih->ihl*4))
+ return;
+ break;
+ case IPPROTO_ICMP: {
+ struct icmphdr _icmph;
+ const struct icmphdr *ich;
+ static const size_t required_len[NR_ICMP_TYPES+1]
+ = { [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB]
+ = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+
+ /* Max length: 11 "PROTO=ICMP " */
+ nf_log_buf_add(m, "PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (ich == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ nf_log_buf_add(m, "PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+ /* Fall through */
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ nf_log_buf_add(m, "[");
+ dump_ipv4_packet(m, info, skb,
+ iphoff + ih->ihl*4+sizeof(_icmph));
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohs(ich->un.frag.mtu));
+ }
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ nf_log_buf_add(m, "PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_ahdr), &_ahdr);
+ if (ah == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 10 "PROTO=ESP " */
+ nf_log_buf_add(m, "PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+ sizeof(_esph), &_esph);
+ if (eh == NULL) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl*4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & XT_LOG_UID) && !iphoff)
+ nf_log_dump_sk_uid_gid(m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (!iphoff && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* UDPLITE: 14+max(25,20) = 39 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 252 = 803 */
+}
+
+static void dump_ipv4_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & XT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++, p++)
+ nf_log_buf_add(m, ":%02x", *p);
+ }
+ nf_log_buf_add(m, " ");
+}
+
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in,
+ out, loginfo, prefix);
+
+ if (in != NULL)
+ dump_ipv4_mac_header(m, loginfo, skb);
+
+ dump_ipv4_packet(m, loginfo, skb, 0);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip_logger __read_mostly = {
+ .name = "nf_log_ipv4",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_ipv4_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+ return 0;
+}
+
+static void __net_exit nf_log_ipv4_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_ip_logger);
+}
+
+static struct pernet_operations nf_log_ipv4_net_ops = {
+ .init = nf_log_ipv4_net_init,
+ .exit = nf_log_ipv4_net_exit,
+};
+
+static int __init nf_log_ipv4_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
+ if (ret < 0)
+ return ret;
+
+ nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ return 0;
+}
+
+static void __exit nf_log_ipv4_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+ nf_log_unregister(&nf_ip_logger);
+}
+
+module_init(nf_log_ipv4_init);
+module_exit(nf_log_ipv4_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
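
The running "Max length" comments above are a budget, not decoration: nf_log_buf is a fixed-size line buffer (just under 1 KiB in the shared nf_log code this module is built on), so the worst-case line has to be provable up front. The closing arithmetic checks out:

    /* one level of ICMP recursion:
     *   maxlen = IP + ICMP + IP + max(TCP, UDP, ICMP, unknown)
     *          = 230 +  91 + 230 + 252 = 803 bytes
     * which leaves comfortable headroom in the log-line buffer.
     */
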
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index d8b2e14efddc..14f5ccd06337 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
htons(oldlen), htons(datalen), 1);
}
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
struct nf_nat_range *range)
{
@@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
return 0;
}
+#endif
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
.l3proto = NFPROTO_IPV4,
@@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
.manip_pkt = nf_nat_ipv4_manip_pkt,
.csum_update = nf_nat_ipv4_csum_update,
.csum_recalc = nf_nat_ipv4_csum_recalc,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
+#endif
#ifdef CONFIG_XFRM
.decode_session = nf_nat_ipv4_decode_session,
#endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 690d890111bb..9414923f1e15 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = {
.manip_pkt = gre_manip_pkt,
.in_range = nf_nat_l4proto_in_range,
.unique_tuple = gre_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index eb303471bcf6..4557b4ab8342 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
.manip_pkt = icmp_manip_pkt,
.in_range = icmp_in_range,
.unique_tuple = icmp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 044a0ddf6a79..a3c59a077a5f 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -911,7 +911,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sin6->sin6_flowinfo = ip6_flowinfo(ip6);
sin6->sin6_scope_id =
ipv6_iface_scope_id(&sin6->sin6_addr,
- IP6CB(skb)->iif);
+ inet6_iif(skb));
*addr_len = sizeof(*sin6);
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ae0af9386f7c..8e3eb39f84e7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,6 +52,7 @@
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
struct net *net = seq->private;
+ unsigned int frag_mem;
int orphans, sockets;
local_bh_disable();
@@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot));
- seq_printf(seq, "FRAG: inuse %d memory %d\n",
- ip_frag_nqueues(net), ip_frag_mem(net));
+ frag_mem = ip_frag_mem(net);
+ seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
return 0;
}
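
With the inet_fragment rework in this series the per-namespace queue counter behind ip_frag_nqueues() is gone, so the sockstat "inuse" column degrades to a flag derived from memory usage; the double negation is the usual C normalization idiom:

    unsigned int frag_mem = ip_frag_mem(net); /* bytes held by frag queues */
    unsigned int inuse = !!frag_mem;          /* 0 if idle, 1 if anything queued */
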
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c65160565e1..739db3100c23 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -58,6 +58,7 @@
#include <linux/in_route.h>
#include <linux/route.h>
#include <linux/skbuff.h>
+#include <linux/igmp.h>
#include <net/net_namespace.h>
#include <net/dst.h>
#include <net/sock.h>
@@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
while (sk) {
delivered = 1;
- if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+ if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+ ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+ skb->dev->ifindex)) {
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
@@ -365,6 +368,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
+ sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
skb->transport_header = skb->network_header;
err = -EFAULT;
if (memcpy_fromiovecend((void *)iph, from, 0, length))
@@ -606,6 +611,8 @@ back_from_confirm:
&rt, msg->msg_flags);
else {
+ sock_tx_timestamp(sk, &ipc.tx_flags);
+
if (!ipc.addr)
ipc.addr = fl4.daddr;
lock_sock(sk);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c86624b36a62..c0c75688896e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+ __u16 *mssp)
{
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d2118e5fbc7..744af67a5989 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -426,6 +426,15 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);
+void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ sock_tx_timestamp(sk, &shinfo->tx_flags);
+ if (shinfo->tx_flags & SKBTX_ANY_SW_TSTAMP)
+ shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+}
+
/*
* Wait for a TCP event.
*
@@ -523,7 +532,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
- if (sk->sk_err)
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR;
return mask;
@@ -959,8 +968,10 @@ new_segment:
copied += copy;
offset += copy;
- if (!(size -= copy))
+ if (!(size -= copy)) {
+ tcp_tx_timestamp(sk, skb);
goto out;
+ }
if (skb->len < size_goal || (flags & MSG_OOB))
continue;
@@ -1252,8 +1263,10 @@ new_segment:
from += copy;
copied += copy;
- if ((seglen -= copy) == 0 && iovlen == 0)
+ if ((seglen -= copy) == 0 && iovlen == 0) {
+ tcp_tx_timestamp(sk, skb);
goto out;
+ }
if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
@@ -1617,6 +1630,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct sk_buff *skb;
u32 urg_hole = 0;
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return ip_recv_error(sk, msg, len, addr_len);
+
if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
(sk->sk_state == TCP_ESTABLISHED))
sk_busy_loop(sk, nonblock);
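
The tcp.c hunks above wire up TCP transmit timestamping: tcp_tx_timestamp() records the sequence number of the last byte of each sent chunk in shinfo->tskey, and completions surface on the socket error queue, which is why tcp_recvmsg() grows an MSG_ERRQUEUE branch and tcp_poll() now raises POLLERR for a non-empty error queue. A hypothetical userspace consumer (flag names assume a kernel with this timestamping series applied; error handling omitted):

    #include <string.h>
    #include <sys/socket.h>
    #include <linux/net_tstamp.h>
    #include <linux/errqueue.h>

    static void drain_tx_tstamps(int fd)    /* fd: connected TCP socket */
    {
            int val = SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE;
            char data[1], ctrl[512];
            struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
            struct msghdr msg;

            setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));

            memset(&msg, 0, sizeof(msg));
            msg.msg_iov = &iov;
            msg.msg_iovlen = 1;
            msg.msg_control = ctrl;
            msg.msg_controllen = sizeof(ctrl);

            /* timestamps arrive as SCM_TIMESTAMPING cmsgs on the error queue */
            recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
    }
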
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 40639c288dc2..a3d47af01906 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -74,6 +74,7 @@
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <net/netdma.h>
+#include <linux/errqueue.h>
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -1904,16 +1905,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
tp->sacked_out = 0;
}
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
bool new_recovery = false;
+ bool is_reneg; /* is receiver reneging on SACKs? */
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1934,7 +1936,11 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_reset_reno_sack(tp);
tp->undo_marker = tp->snd_una;
- if (how) {
+
+ skb = tcp_write_queue_head(sk);
+ is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+ if (is_reneg) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -1948,7 +1954,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->undo_marker = 0;
TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
@@ -1981,19 +1987,21 @@ void tcp_enter_loss(struct sock *sk, int how)
* remembered SACKs do not reflect real state of receiver i.e.
* receiver _host_ is heavily congested (or buggy).
*
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
*/
static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
if (flag & FLAG_SACK_RENEGING) {
- struct inet_connection_sock *icsk = inet_csk(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+ msecs_to_jiffies(10));
- tcp_enter_loss(sk, 1);
- icsk->icsk_retransmits++;
- tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- icsk->icsk_rto, TCP_RTO_MAX);
+ delay, TCP_RTO_MAX);
return true;
}
return false;
@@ -2475,7 +2483,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
* losses and/or application stalls), do not perform any further cwnd
* reductions, but instead slow start up to ssthresh.
*/
-static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+static void tcp_init_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2485,8 +2493,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
tp->prr_out = 0;
- if (set_ssthresh)
- tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
TCP_ECN_queue_cwr(tp);
}
@@ -2528,14 +2535,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
}
/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+void tcp_enter_cwr(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
- tcp_init_cwnd_reduction(sk, set_ssthresh);
+ tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
@@ -2564,7 +2571,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
tp->retrans_stamp = 0;
if (flag & FLAG_ECE)
- tcp_enter_cwr(sk, 1);
+ tcp_enter_cwr(sk);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
@@ -2670,7 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tcp_init_cwnd_reduction(sk, true);
+ tcp_init_cwnd_reduction(sk);
}
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
@@ -3100,6 +3107,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->retrans_stamp = 0;
}
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
+ between(skb_shinfo(skb)->tskey, prior_snd_una,
+ tp->snd_una + 1))
+ __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
if (!fully_acked)
break;
@@ -3346,7 +3358,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tp->tlp_high_seq = 0;
/* Don't reduce cwnd if DSACK arrives for TLP retrans. */
if (!(flag & FLAG_DSACKING_ACK)) {
- tcp_init_cwnd_reduction(sk, true);
+ tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
tcp_try_keep_open(sk);
@@ -5877,3 +5889,153 @@ discard:
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ if (family == AF_INET)
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+ &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
+ &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_options_received tmp_opt;
+ struct request_sock *req;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = NULL;
+ __u32 isn = TCP_SKB_CB(skb)->when;
+ bool want_cookie = false, fastopen;
+ struct flowi fl;
+ struct tcp_fastopen_cookie foc = { .len = -1 };
+ int err;
+
+
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ if ((sysctl_tcp_syncookies == 2 ||
+ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
+
+
+ /* Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ goto drop;
+ }
+
+ req = inet_reqsk_alloc(rsk_ops);
+ if (!req)
+ goto drop;
+
+ tcp_rsk(req)->af_specific = af_ops;
+
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.mss_clamp = af_ops->mss_clamp;
+ tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+ if (want_cookie && !tmp_opt.saw_tstamp)
+ tcp_clear_options(&tmp_opt);
+
+ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+ tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+ af_ops->init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ if (!want_cookie || tmp_opt.tstamp_ok)
+ TCP_ECN_create_request(req, skb, sock_net(sk));
+
+ if (want_cookie) {
+ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
+ } else if (!isn) {
+ /* VJ's idea. We save last timestamp seen
+ * from the destination in peer table, when entering
+ * state TIME-WAIT, and check against it before
+ * accepting new connection request.
+ *
+ * If "isn" is not zero, this request hit alive
+ * timewait bucket, so that all the necessary checks
+ * are made in the function processing timewait state.
+ */
+ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+ bool strict;
+
+ dst = af_ops->route_req(sk, &fl, req, &strict);
+ if (dst && strict &&
+ !tcp_peer_is_proven(req, dst, true)) {
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ goto drop_and_release;
+ }
+ }
+ /* Kill the following clause, if you dislike this way. */
+ else if (!sysctl_tcp_syncookies &&
+ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (sysctl_max_syn_backlog >> 2)) &&
+ !tcp_peer_is_proven(req, dst, false)) {
+ /* Without syncookies last quarter of
+ * backlog is filled with destinations,
+ * proven to be alive.
+ * It means that we continue to communicate
+ * to destinations, already remembered
+ * to the moment of synflood.
+ */
+ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+ rsk_ops->family);
+ goto drop_and_release;
+ }
+
+ isn = af_ops->init_seq(skb);
+ }
+ if (!dst) {
+ dst = af_ops->route_req(sk, &fl, req, NULL);
+ if (!dst)
+ goto drop_and_free;
+ }
+
+ tcp_rsk(req)->snt_isn = isn;
+ tcp_openreq_init_rwin(req, sk, dst);
+ fastopen = !want_cookie &&
+ tcp_try_fastopen(sk, skb, req, &foc, dst);
+ err = af_ops->send_synack(sk, dst, &fl, req,
+ skb_get_queue_mapping(skb), &foc);
+ if (!fastopen) {
+ if (err || want_cookie)
+ goto drop_and_free;
+
+ tcp_rsk(req)->listener = NULL;
+ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ }
+
+ return 0;
+
+drop_and_release:
+ dst_release(dst);
+drop_and_free:
+ reqsk_free(req);
+drop:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
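
On the reworked reneging path above, the grace delay is derived from the scaled srtt: tp->srtt_us stores the smoothed RTT in microseconds left-shifted by 3 (i.e. 8 * SRTT), so a right shift by 4 halves the real RTT, matching the "max(RTT/2, 10ms)" in the comment. Restated on its own:

    /* srtt_us >> 4 == (8 * SRTT_us) / 16 == SRTT_us / 2 */
    static unsigned long reneging_delay(u32 srtt_us)
    {
            return max(usecs_to_jiffies(srtt_us >> 4),
                       msecs_to_jiffies(10));
    }
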
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77cccda1ad0c..992a1f926009 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -99,7 +99,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr,
@@ -208,6 +208,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
+ inet_set_txhash(sk);
+
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
@@ -342,11 +344,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
int err;
struct net *net = dev_net(icmp_skb->dev);
- if (icmp_skb->len < (iph->ihl << 2) + 8) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
- return;
- }
-
sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
iph->saddr, th->source, inet_iif(icmp_skb));
if (!sk) {
@@ -814,6 +811,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
* socket.
*/
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ struct flowi *fl,
struct request_sock *req,
u16 queue_mapping,
struct tcp_fastopen_cookie *foc)
@@ -837,24 +835,11 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
ireq->ir_rmt_addr,
ireq->opt);
err = net_xmit_eval(err);
- if (!tcp_rsk(req)->snt_synack && !err)
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
}
return err;
}
-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
-{
- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
-
- if (!res) {
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- }
- return res;
-}
-
/*
* IPv4 request_sock destructor.
*/
@@ -1064,7 +1049,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
if (sin->sin_family != AF_INET)
return -EINVAL;
- if (!cmd.tcpm_key || !cmd.tcpm_keylen)
+ if (!cmd.tcpm_keylen)
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
AF_INET);
@@ -1237,161 +1222,68 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
#endif
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+ ireq->no_srccheck = inet_sk(sk)->transparent;
+ ireq->opt = tcp_v4_save_options(skb);
+}
+
+static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+ const struct request_sock *req,
+ bool *strict)
+{
+ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
+
+ if (strict) {
+ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
+ *strict = true;
+ else
+ *strict = false;
+ }
+
+ return dst;
+}
+
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
- .rtx_syn_ack = tcp_v4_rtx_synack,
+ .rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
.syn_ack_timeout = tcp_syn_ack_timeout,
};
-#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+ .mss_clamp = TCP_MSS_DEFAULT,
+#ifdef CONFIG_TCP_MD5SIG
.md5_lookup = tcp_v4_reqsk_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
-};
#endif
+ .init_req = tcp_v4_init_req,
+#ifdef CONFIG_SYN_COOKIES
+ .cookie_init_seq = cookie_v4_init_sequence,
+#endif
+ .route_req = tcp_v4_route_req,
+ .init_seq = tcp_v4_init_sequence,
+ .send_synack = tcp_v4_send_synack,
+ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
+};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct tcp_options_received tmp_opt;
- struct request_sock *req;
- struct inet_request_sock *ireq;
- struct tcp_sock *tp = tcp_sk(sk);
- struct dst_entry *dst = NULL;
- __be32 saddr = ip_hdr(skb)->saddr;
- __be32 daddr = ip_hdr(skb)->daddr;
- __u32 isn = TCP_SKB_CB(skb)->when;
- bool want_cookie = false, fastopen;
- struct flowi4 fl4;
- struct tcp_fastopen_cookie foc = { .len = -1 };
- int err;
-
/* Never answer to SYNs send to broadcast or multicast */
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if ((sysctl_tcp_syncookies == 2 ||
- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
- if (!want_cookie)
- goto drop;
- }
-
- /* Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
- goto drop;
- }
-
- req = inet_reqsk_alloc(&tcp_request_sock_ops);
- if (!req)
- goto drop;
-
-#ifdef CONFIG_TCP_MD5SIG
- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
-#endif
-
- tcp_clear_options(&tmp_opt);
- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
-
- if (want_cookie && !tmp_opt.saw_tstamp)
- tcp_clear_options(&tmp_opt);
-
- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
- tcp_openreq_init(req, &tmp_opt, skb);
-
- ireq = inet_rsk(req);
- ireq->ir_loc_addr = daddr;
- ireq->ir_rmt_addr = saddr;
- ireq->no_srccheck = inet_sk(sk)->transparent;
- ireq->opt = tcp_v4_save_options(skb);
- ireq->ir_mark = inet_request_mark(sk, skb);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- if (!want_cookie || tmp_opt.tstamp_ok)
- TCP_ECN_create_request(req, skb, sock_net(sk));
-
- if (want_cookie) {
- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
- req->cookie_ts = tmp_opt.tstamp_ok;
- } else if (!isn) {
- /* VJ's idea. We save last timestamp seen
- * from the destination in peer table, when entering
- * state TIME-WAIT, and check against it before
- * accepting new connection request.
- *
- * If "isn" is not zero, this request hit alive
- * timewait bucket, so that all the necessary checks
- * are made in the function processing timewait state.
- */
- if (tmp_opt.saw_tstamp &&
- tcp_death_row.sysctl_tw_recycle &&
- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
- fl4.daddr == saddr) {
- if (!tcp_peer_is_proven(req, dst, true)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
- goto drop_and_release;
- }
- }
- /* Kill the following clause, if you dislike this way. */
- else if (!sysctl_tcp_syncookies &&
- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (sysctl_max_syn_backlog >> 2)) &&
- !tcp_peer_is_proven(req, dst, false)) {
- /* Without syncookies last quarter of
- * backlog is filled with destinations,
- * proven to be alive.
- * It means that we continue to communicate
- * to destinations, already remembered
- * to the moment of synflood.
- */
- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
- &saddr, ntohs(tcp_hdr(skb)->source));
- goto drop_and_release;
- }
-
- isn = tcp_v4_init_sequence(skb);
- }
- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
- goto drop_and_free;
-
- tcp_rsk(req)->snt_isn = isn;
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
- tcp_openreq_init_rwin(req, sk, dst);
- fastopen = !want_cookie &&
- tcp_try_fastopen(sk, skb, req, &foc, dst);
- err = tcp_v4_send_synack(sk, dst, req,
- skb_get_queue_mapping(skb), &foc);
- if (!fastopen) {
- if (err || want_cookie)
- goto drop_and_free;
-
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
- tcp_rsk(req)->listener = NULL;
- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- }
-
- return 0;
+ return tcp_conn_request(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, skb);
-drop_and_release:
- dst_release(dst);
-drop_and_free:
- reqsk_free(req);
drop:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return 0;
@@ -1439,6 +1331,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newinet->mc_ttl = ip_hdr(skb)->ttl;
newinet->rcv_tos = ip_hdr(skb)->tos;
inet_csk(newsk)->icsk_ext_hdr_len = 0;
+ inet_set_txhash(newsk);
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = newtp->write_seq ^ jiffies;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4fe041805989..0d54e59b9ea8 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1093,7 +1093,6 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
.doit = tcp_metrics_nl_cmd_get,
.dumpit = tcp_metrics_nl_dump,
.policy = tcp_metrics_nl_policy,
- .flags = GENL_ADMIN_PERM,
},
{
.cmd = TCP_METRICS_CMD_DEL,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e68e0d4af6c9..1649988bd1b6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_tclass = np->tclass;
tw->tw_flowlabel = np->flow_label >> 12;
- tw->tw_ipv6only = np->ipv6only;
+ tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 55046ecd083e..f597119fc4e7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,6 +14,21 @@
#include <net/tcp.h>
#include <net/protocol.h>
+void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, unsigned int seq,
+ unsigned int mss)
+{
+ while (skb) {
+ if (ts_seq < (__u64) seq + mss) {
+ skb_shinfo(skb)->tx_flags = SKBTX_SW_TSTAMP;
+ skb_shinfo(skb)->tskey = ts_seq;
+ return;
+ }
+
+ skb = skb->next;
+ seq += mss;
+ }
+}
+
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -91,6 +106,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th = tcp_hdr(skb);
seq = ntohl(th->seq);
+ if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
+ tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+
newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
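
tcp_gso_tstamp() above moves the timestamp request onto the one segment that actually carries the marked byte: segment i of the train covers sequence range [seq + i*mss, seq + (i+1)*mss), so the request lands in segment (tskey - seq) / mss. A standalone restatement:

    /* e.g. a 3000-byte GSO skb with mss 1000 and tskey 2500 tags
     * segment (2500 - 0) / 1000 == 2, the third one, so exactly
     * one completion fires for the chunk.
     */
    static unsigned int tstamp_segment(u32 tskey, u32 seq, unsigned int mss)
    {
            return (tskey - seq) / mss;
    }
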
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 179b51e6bda3..8fcfc91964ec 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_orphan(skb);
skb->sk = sk;
skb->destructor = tcp_wfree;
+ skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
@@ -978,7 +979,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (likely(err <= 0))
return err;
- tcp_enter_cwr(sk, 1);
+ tcp_enter_cwr(sk);
return net_xmit_eval(err);
}
@@ -3301,3 +3302,18 @@ void tcp_send_probe0(struct sock *sk)
TCP_RTO_MAX);
}
}
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+ struct flowi fl;
+ int res;
+
+ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+ if (!res) {
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ }
+ return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 286227abed10..df90cd1ce37f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -391,7 +391,7 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_write_err(sk);
goto out;
}
- tcp_enter_loss(sk, 0);
+ tcp_enter_loss(sk);
tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
__sk_dst_reset(sk);
goto out_reset_timer;
@@ -422,7 +422,7 @@ void tcp_retransmit_timer(struct sock *sk)
NET_INC_STATS_BH(sock_net(sk), mib_idx);
}
- tcp_enter_loss(sk, 0);
+ tcp_enter_loss(sk);
if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
/* Retransmission failed because of local congestion,
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9a5e05f27f4f..b40ad897f945 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -218,7 +218,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* This is:
* (actual rate in segments) * baseRTT
*/
- target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt;
+ target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+ do_div(target_cwnd, rtt);
/* Calculate the difference between the window we had,
* and the window we would like to have. This quantity
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 27b9825753d1..8276977d2c85 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -144,7 +144,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rtt = veno->minrtt;
- target_cwnd = (tp->snd_cwnd * veno->basertt);
+ target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
target_cwnd <<= V_PARAM_SHIFT;
do_div(target_cwnd, rtt);
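
The tcp_vegas and tcp_veno hunks fix the same latent bug: with 32-bit operands the cwnd * baseRTT product is computed modulo 2^32 before the division ever runs. A standalone illustration with hypothetical (but plausible) values:

    u32 cwnd = 10000;              /* segments */
    u32 base = 500000;             /* 500 ms of base RTT, in usec */
    u32 rtt  = 510000;             /* current RTT, in usec */
    u32 bad  = cwnd * base;        /* 5e9 wraps to 705032704 */
    u64 good = (u64)cwnd * base;   /* full 5000000000 */

    do_div(good, rtt);             /* do_div(): portable u64-by-u32 divide */
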
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7d5a8661df76..f57c0e4c2326 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,27 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
- __be16 loc_port, __be32 loc_addr,
- __be16 rmt_port, __be32 rmt_addr,
- int dif)
-{
- struct hlist_nulls_node *node;
- struct sock *s = sk;
- unsigned short hnum = ntohs(loc_port);
-
- sk_nulls_for_each_from(s, node) {
- if (__udp_is_mcast_sock(net, s,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum))
- goto found;
- }
- s = NULL;
-found:
- return s;
-}
-
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -1588,7 +1567,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto csum_error;
- if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
goto drop;
@@ -1640,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
skb1 = NULL;
+
+ sock_put(sk);
}
if (unlikely(skb1))
kfree_skb(skb1);
@@ -1668,41 +1649,50 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udp_table *udptable)
{
struct sock *sk, *stack[256 / sizeof(struct sock *)];
- struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
- int dif;
- unsigned int i, count = 0;
+ struct hlist_nulls_node *node;
+ unsigned short hnum = ntohs(uh->dest);
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+ int dif = skb->dev->ifindex;
+ unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+ unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+ if (use_hash2) {
+ hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+ udp_table.mask;
+ hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+ hslot = &udp_table.hash2[hash2];
+ offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+ }
spin_lock(&hslot->lock);
- sk = sk_nulls_head(&hslot->head);
- dif = skb->dev->ifindex;
- sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
- while (sk) {
- stack[count++] = sk;
- sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
- daddr, uh->source, saddr, dif);
- if (unlikely(count == ARRAY_SIZE(stack))) {
- if (!sk)
- break;
- flush_stack(stack, count, skb, ~0);
- count = 0;
+ sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
+ if (__udp_is_mcast_sock(net, sk,
+ uh->dest, daddr,
+ uh->source, saddr,
+ dif, hnum)) {
+ if (unlikely(count == ARRAY_SIZE(stack))) {
+ flush_stack(stack, count, skb, ~0);
+ count = 0;
+ }
+ stack[count++] = sk;
+ sock_hold(sk);
}
}
- /*
- * before releasing chain lock, we must take a reference on sockets
- */
- for (i = 0; i < count; i++)
- sock_hold(stack[i]);
spin_unlock(&hslot->lock);
+ /* Also lookup *:port if we are using hash2 and haven't done so yet. */
+ if (use_hash2 && hash2 != hash2_any) {
+ hash2 = hash2_any;
+ goto start_lookup;
+ }
+
/*
* do the slow work with no lock held
*/
if (count) {
flush_stack(stack, count, skb, count - 1);
-
- for (i = 0; i < count; i++)
- sock_put(stack[i]);
} else {
kfree_skb(skb);
}
@@ -2526,79 +2516,3 @@ void __init udp_init(void)
sysctl_udp_rmem_min = SK_MEM_QUANTUM;
sysctl_udp_wmem_min = SK_MEM_QUANTUM;
}
-
-struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- u16 mac_offset = skb->mac_header;
- int mac_len = skb->mac_len;
- int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
- __be16 protocol = skb->protocol;
- netdev_features_t enc_features;
- int udp_offset, outer_hlen;
- unsigned int oldlen;
- bool need_csum;
-
- oldlen = (u16)~skb->len;
-
- if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
- goto out;
-
- skb->encapsulation = 0;
- __skb_pull(skb, tnl_hlen);
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, skb_inner_network_offset(skb));
- skb->mac_len = skb_inner_network_offset(skb);
- skb->protocol = htons(ETH_P_TEB);
-
- need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
- if (need_csum)
- skb->encap_hdr_csum = 1;
-
- /* segment inner packet. */
- enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
- segs = skb_mac_gso_segment(skb, enc_features);
- if (!segs || IS_ERR(segs)) {
- skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
- mac_len);
- goto out;
- }
-
- outer_hlen = skb_tnl_header_len(skb);
- udp_offset = outer_hlen - tnl_hlen;
- skb = segs;
- do {
- struct udphdr *uh;
- int len;
-
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
-
- skb->mac_len = mac_len;
-
- skb_push(skb, outer_hlen);
- skb_reset_mac_header(skb);
- skb_set_network_header(skb, mac_len);
- skb_set_transport_header(skb, udp_offset);
- len = skb->len - udp_offset;
- uh = udp_hdr(skb);
- uh->len = htons(len);
-
- if (need_csum) {
- __be32 delta = htonl(oldlen + len);
-
- uh->check = ~csum_fold((__force __wsum)
- ((__force u32)uh->check +
- (__force u32)delta));
- uh->check = gso_make_checksum(skb, ~uh->check);
-
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
- }
-
- skb->protocol = protocol;
- } while ((skb = skb->next));
-out:
- return segs;
-}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 546d2d439dda..59035bc3008d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -47,6 +47,82 @@ static int udp4_ufo_send_check(struct sk_buff *skb)
return 0;
}
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ u16 mac_offset = skb->mac_header;
+ int mac_len = skb->mac_len;
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ __be16 protocol = skb->protocol;
+ netdev_features_t enc_features;
+ int udp_offset, outer_hlen;
+ unsigned int oldlen;
+ bool need_csum;
+
+ oldlen = (u16)~skb->len;
+
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+ goto out;
+
+ skb->encapsulation = 0;
+ __skb_pull(skb, tnl_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = htons(ETH_P_TEB);
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
+ if (need_csum)
+ skb->encap_hdr_csum = 1;
+
+ /* segment inner packet. */
+ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+ segs = skb_mac_gso_segment(skb, enc_features);
+ if (IS_ERR_OR_NULL(segs)) {
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
+ goto out;
+ }
+
+ outer_hlen = skb_tnl_header_len(skb);
+ udp_offset = outer_hlen - tnl_hlen;
+ skb = segs;
+ do {
+ struct udphdr *uh;
+ int len;
+
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+
+ skb->mac_len = mac_len;
+
+ skb_push(skb, outer_hlen);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_set_transport_header(skb, udp_offset);
+ len = skb->len - udp_offset;
+ uh = udp_hdr(skb);
+ uh->len = htons(len);
+
+ if (need_csum) {
+ __be32 delta = htonl(oldlen + len);
+
+ uh->check = ~csum_fold((__force __wsum)
+ ((__force u32)uh->check +
+ (__force u32)delta));
+ uh->check = gso_make_checksum(skb, ~uh->check);
+
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ }
+
+ skb->protocol = protocol;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
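
In the moved segmentation code, the need_csum branch patches uh->check incrementally instead of re-summing each segment: oldlen is stored pre-complemented ((u16)~skb->len), so folding in htonl(oldlen + len) is the RFC 1624 update csum' = ~(~csum + ~len_old + len_new). A plain C restatement:

    #include <stdint.h>

    static uint16_t csum_fix_len(uint16_t csum, uint16_t len_old,
                                 uint16_t len_new)
    {
            uint32_t sum = (uint16_t)~csum + (uint16_t)~len_old + len_new;

            while (sum >> 16)                 /* fold carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }
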
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
new file mode 100644
index 000000000000..61ec1a65207e
--- /dev/null
+++ b/net/ipv4/udp_tunnel.c
@@ -0,0 +1,100 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/net_namespace.h>
+
+int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
+ struct socket **sockp)
+{
+ int err = -EINVAL;
+ struct socket *sock = NULL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (cfg->family == AF_INET6) {
+ struct sockaddr_in6 udp6_addr;
+
+ err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ sk_change_net(sock->sk, net);
+
+ udp6_addr.sin6_family = AF_INET6;
+ memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
+ sizeof(udp6_addr.sin6_addr));
+ udp6_addr.sin6_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
+ sizeof(udp6_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ udp6_addr.sin6_family = AF_INET6;
+ memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
+ sizeof(udp6_addr.sin6_addr));
+ udp6_addr.sin6_port = cfg->peer_udp_port;
+ err = kernel_connect(sock,
+ (struct sockaddr *)&udp6_addr,
+ sizeof(udp6_addr), 0);
+ }
+ if (err < 0)
+ goto error;
+
+ udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums);
+ udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums);
+ } else
+#endif
+ if (cfg->family == AF_INET) {
+ struct sockaddr_in udp_addr;
+
+ err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+ if (err < 0)
+ goto error;
+
+ sk_change_net(sock->sk, net);
+
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->local_ip;
+ udp_addr.sin_port = cfg->local_udp_port;
+ err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr));
+ if (err < 0)
+ goto error;
+
+ if (cfg->peer_udp_port) {
+ udp_addr.sin_family = AF_INET;
+ udp_addr.sin_addr = cfg->peer_ip;
+ udp_addr.sin_port = cfg->peer_udp_port;
+ err = kernel_connect(sock,
+ (struct sockaddr *)&udp_addr,
+ sizeof(udp_addr), 0);
+ if (err < 0)
+ goto error;
+ }
+
+ sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
+ } else {
+ return -EPFNOSUPPORT;
+ }
+
+
+ *sockp = sock;
+
+ return 0;
+
+error:
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sk_release_kernel(sock->sk);
+ }
+ *sockp = NULL;
+ return err;
+}
+EXPORT_SYMBOL(udp_sock_create);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index a2ce0101eaac..dccefa9d84cf 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -124,7 +124,7 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
for_each_protocol_rcu(ah4_handlers, handler)
if ((ret = handler->handler(skb)) != -EINVAL)
- return ret;;
+ return ret;
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);