summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorMichal Marek <mmarek@suse.cz>2010-12-14 22:01:55 +0100
committerMichal Marek <mmarek@suse.cz>2010-12-14 22:01:55 +0100
commit8990c1bc4be46473ad19bf2fa612ca57286f3df4 (patch)
tree3cea60576903a1d26c67e6ec62891b524d390e95 /net/ipv4
parent2979076fbf17a0947d6eba367b0cac19c907c160 (diff)
parentc8ddb2713c624f432fa5fe3c7ecffcdda46ea0d4 (diff)
downloadlinux-8990c1bc4be46473ad19bf2fa612ca57286f3df4.tar.gz
linux-8990c1bc4be46473ad19bf2fa612ca57286f3df4.tar.bz2
linux-8990c1bc4be46473ad19bf2fa612ca57286f3df4.zip
Merge commit 'v2.6.37-rc1' into kbuild/kbuild
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig15
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c92
-rw-r--r--net/ipv4/arp.c294
-rw-r--r--net/ipv4/cipso_ipv4.c2
-rw-r--r--net/ipv4/datagram.c11
-rw-r--r--net/ipv4/devinet.c12
-rw-r--r--net/ipv4/fib_frontend.c220
-rw-r--r--net/ipv4/fib_hash.c329
-rw-r--r--net/ipv4/fib_lookup.h11
-rw-r--r--net/ipv4/fib_rules.c13
-rw-r--r--net/ipv4/fib_semantics.c297
-rw-r--r--net/ipv4/fib_trie.c87
-rw-r--r--net/ipv4/gre.c152
-rw-r--r--net/ipv4/icmp.c41
-rw-r--r--net/ipv4/igmp.c66
-rw-r--r--net/ipv4/inet_connection_sock.c21
-rw-r--r--net/ipv4/inet_diag.c2
-rw-r--r--net/ipv4/inet_fragment.c1
-rw-r--r--net/ipv4/inet_hashtables.c32
-rw-r--r--net/ipv4/inetpeer.c370
-rw-r--r--net/ipv4/ip_forward.c10
-rw-r--r--net/ipv4/ip_fragment.c33
-rw-r--r--net/ipv4/ip_gre.c266
-rw-r--r--net/ipv4/ip_input.c26
-rw-r--r--net/ipv4/ip_options.c3
-rw-r--r--net/ipv4/ip_output.c119
-rw-r--r--net/ipv4/ip_sockglue.c58
-rw-r--r--net/ipv4/ipconfig.c7
-rw-r--r--net/ipv4/ipip.c221
-rw-r--r--net/ipv4/ipmr.c436
-rw-r--r--net/ipv4/netfilter.c12
-rw-r--r--net/ipv4/netfilter/Kconfig6
-rw-r--r--net/ipv4/netfilter/arp_tables.c91
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c2
-rw-r--r--net/ipv4/netfilter/ip_queue.c57
-rw-r--r--net/ipv4/netfilter/ip_tables.c105
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c81
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c185
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c6
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c13
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c28
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c7
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c78
-rw-r--r--net/ipv4/netfilter/nf_nat_ftp.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c53
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c76
-rw-r--r--net/ipv4/netfilter/nf_nat_irc.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_common.c12
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_dccp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c12
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c10
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_sctp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_tcp.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udp.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udplite.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_unknown.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c27
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c27
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c10
-rw-r--r--net/ipv4/proc.c16
-rw-r--r--net/ipv4/protocol.c36
-rw-r--r--net/ipv4/raw.c24
-rw-r--r--net/ipv4/route.c774
-rw-r--r--net/ipv4/syncookies.c105
-rw-r--r--net/ipv4/tcp.c130
-rw-r--r--net/ipv4/tcp_cong.c5
-rw-r--r--net/ipv4/tcp_illinois.c2
-rw-r--r--net/ipv4/tcp_input.c86
-rw-r--r--net/ipv4/tcp_ipv4.c187
-rw-r--r--net/ipv4/tcp_minisocks.c11
-rw-r--r--net/ipv4/tcp_output.c108
-rw-r--r--net/ipv4/tcp_probe.c1
-rw-r--r--net/ipv4/tcp_timer.c69
-rw-r--r--net/ipv4/tcp_veno.c2
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/tunnel4.c46
-rw-r--r--net/ipv4/udp.c54
-rw-r--r--net/ipv4/udplite.c3
-rw-r--r--net/ipv4/xfrm4_input.c1
-rw-r--r--net/ipv4/xfrm4_policy.c8
-rw-r--r--net/ipv4/xfrm4_state.c33
-rw-r--r--net/ipv4/xfrm4_tunnel.c4
85 files changed, 3399 insertions, 2519 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d191249..9e95d7fb6d5a 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
rp_filter on use:
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
- and
+ or
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
Note that some distributions enable it in startup scripts.
@@ -84,7 +84,7 @@ config IP_FIB_TRIE
An experimental study of compression methods for dynamic tries
Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+ <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
endchoice
@@ -215,8 +215,15 @@ config NET_IPIP
be inserted in and removed from the running kernel whenever you
want). Most people won't need this and can say N.
+config NET_IPGRE_DEMUX
+ tristate "IP: GRE demultiplexer"
+ help
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
+
config NET_IPGRE
tristate "IP: GRE tunnels over IP"
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
@@ -412,7 +419,7 @@ config INET_XFRM_MODE_BEET
If unsure, say Y.
config INET_LRO
- bool "Large Receive Offload (ipv4/tcp)"
+ tristate "Large Receive Offload (ipv4/tcp)"
default y
---help---
Support for Large Receive Offload (ipv4/tcp).
@@ -555,7 +562,7 @@ config TCP_CONG_VENO
distinguishing to circumvent the difficult judgment of the packet loss
type. TCP Veno cuts down less congestion window in response to random
loss packets.
- See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..4978d22f9a75 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 551ce564b035..f581f77d1097 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
/*
* inet_ehash_secret must be set exactly once
- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
*/
void build_ehash_secret(void)
{
u32 rnd;
+
do {
get_random_bytes(&rnd, sizeof(rnd));
} while (rnd == 0);
- spin_lock_bh(&inetsw_lock);
- if (!inet_ehash_secret)
- inet_ehash_secret = rnd;
- spin_unlock_bh(&inetsw_lock);
+
+ cmpxchg(&inet_ehash_secret, 0, rnd);
}
EXPORT_SYMBOL(build_ehash_secret);
@@ -355,6 +353,8 @@ lookup_protocol:
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+ inet->nodefrag = 0;
+
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
@@ -725,28 +725,31 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
sock_rps_record_flow(sk);
/* We may need to bind the socket. */
- if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
- size_t size, int flags)
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
{
struct sock *sk = sock->sk;
sock_rps_record_flow(sk);
/* We may need to bind the socket. */
- if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
return -EAGAIN;
if (sk->sk_prot->sendpage)
return sk->sk_prot->sendpage(sk, page, offset, size, flags);
return sock_no_sendpage(sock, page, offset, size, flags);
}
+EXPORT_SYMBOL(inet_sendpage);
int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size, int flags)
@@ -892,10 +895,10 @@ const struct proto_ops inet_stream_ops = {
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
- .sendmsg = tcp_sendmsg,
+ .sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = tcp_sendpage,
+ .sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
@@ -1100,7 +1103,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (err)
return err;
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
new_saddr = rt->rt_src;
@@ -1166,7 +1169,7 @@ int inet_sk_rebuild_header(struct sock *sk)
err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
}
if (!err)
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
else {
/* Routing failed... */
sk->sk_route_caps = 0;
@@ -1425,13 +1428,49 @@ unsigned long snmp_fold_field(void __percpu *mib[], int offt)
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+ u64 res = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *bhptr, *userptr;
+ struct u64_stats_sync *syncp;
+ u64 v_bh, v_user;
+ unsigned int start;
+
+ /* first mib used by softirq context, we must use _bh() accessors */
+ bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin_bh(syncp);
+ v_bh = *(((u64 *) bhptr) + offt);
+ } while (u64_stats_fetch_retry_bh(syncp, start));
+
+ /* second mib used in USER context */
+ userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
+ syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin(syncp);
+ v_user = *(((u64 *) userptr) + offt);
+ } while (u64_stats_fetch_retry(syncp, start));
+
+ res += v_bh + v_user;
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
{
BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long));
+ ptr[0] = __alloc_percpu(mibsize, align);
if (!ptr[0])
goto err0;
- ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long));
+ ptr[1] = __alloc_percpu(mibsize, align);
if (!ptr[1])
goto err1;
return 0;
@@ -1488,25 +1527,32 @@ static const struct net_protocol icmp_protocol = {
static __net_init int ipv4_mib_init_net(struct net *net)
{
if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
- sizeof(struct tcp_mib)) < 0)
+ sizeof(struct tcp_mib),
+ __alignof__(struct tcp_mib)) < 0)
goto err_tcp_mib;
if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
- sizeof(struct ipstats_mib)) < 0)
+ sizeof(struct ipstats_mib),
+ __alignof__(struct ipstats_mib)) < 0)
goto err_ip_mib;
if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
- sizeof(struct linux_mib)) < 0)
+ sizeof(struct linux_mib),
+ __alignof__(struct linux_mib)) < 0)
goto err_net_mib;
if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
- sizeof(struct udp_mib)) < 0)
+ sizeof(struct udp_mib),
+ __alignof__(struct udp_mib)) < 0)
goto err_udp_mib;
if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
- sizeof(struct udp_mib)) < 0)
+ sizeof(struct udp_mib),
+ __alignof__(struct udp_mib)) < 0)
goto err_udplite_mib;
if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
- sizeof(struct icmp_mib)) < 0)
+ sizeof(struct icmp_mib),
+ __alignof__(struct icmp_mib)) < 0)
goto err_icmp_mib;
if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
- sizeof(struct icmpmsg_mib)) < 0)
+ sizeof(struct icmpmsg_mib),
+ __alignof__(struct icmpmsg_mib)) < 0)
goto err_icmpmsg_mib;
tcp_mib_init(net);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index f094b75810db..d8e540c5b071 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
* Stuart Cheshire : Metricom and grat arp fixes
* *** FOR 2.1 clean this up ***
* Lawrence V. Stefani: (08/12/96) Added FDDI support.
- * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
* folded into the mainstream FDDI code.
* Ack spit, Linus how did you allow that
* one in...
@@ -116,17 +116,18 @@
#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
#include <net/atmclip.h>
struct neigh_table *clip_tbl_hook;
+EXPORT_SYMBOL(clip_tbl_hook);
#endif
#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/netfilter_arp.h>
/*
* Interface to generic neighbour cache.
*/
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -160,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
.queue_xmit = dev_queue_xmit,
};
-const struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
.family = AF_INET,
.solicit = arp_solicit,
.error_report = arp_error_report,
@@ -171,33 +172,34 @@ const struct neigh_ops arp_broken_ops = {
};
struct neigh_table arp_tbl = {
- .family = AF_INET,
- .entry_size = sizeof(struct neighbour) + 4,
- .key_len = 4,
- .hash = arp_hash,
- .constructor = arp_constructor,
- .proxy_redo = parp_redo,
- .id = "arp_cache",
- .parms = {
- .tbl = &arp_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
+ .family = AF_INET,
+ .entry_size = sizeof(struct neighbour) + 4,
+ .key_len = 4,
+ .hash = arp_hash,
+ .constructor = arp_constructor,
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+ .tbl = &arp_tbl,
+ .base_reachable_time = 30 * HZ,
+ .retrans_time = 1 * HZ,
+ .gc_staletime = 60 * HZ,
+ .reachable_time = 30 * HZ,
+ .delay_probe_time = 5 * HZ,
+ .queue_len = 3,
+ .ucast_probes = 3,
+ .mcast_probes = 3,
+ .anycast_delay = 1 * HZ,
+ .proxy_delay = (8 * HZ) / 10,
+ .proxy_qlen = 64,
+ .locktime = 1 * HZ,
},
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
+ .gc_interval = 30 * HZ,
+ .gc_thresh1 = 128,
+ .gc_thresh2 = 512,
+ .gc_thresh3 = 1024,
};
+EXPORT_SYMBOL(arp_tbl);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
{
@@ -223,14 +225,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
}
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+ const struct net_device *dev,
+ __u32 hash_rnd)
{
- return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+ return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
}
static int arp_constructor(struct neighbour *neigh)
{
- __be32 addr = *(__be32*)neigh->primary_key;
+ __be32 addr = *(__be32 *)neigh->primary_key;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
@@ -293,16 +297,19 @@ static int arp_constructor(struct neighbour *neigh)
neigh->ops = &arp_broken_ops;
neigh->output = neigh->ops->output;
return 0;
+#else
+ break;
#endif
- ;}
+ }
#endif
if (neigh->type == RTN_MULTICAST) {
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
- } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+ } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
- } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+ } else if (neigh->type == RTN_BROADCAST ||
+ (dev->flags & IFF_POINTOPOINT)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
@@ -312,7 +319,7 @@ static int arp_constructor(struct neighbour *neigh)
else
neigh->ops = &arp_generic_ops;
- if (neigh->nud_state&NUD_VALID)
+ if (neigh->nud_state & NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output;
@@ -331,17 +338,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
__be32 saddr = 0;
u8 *dst_ha = NULL;
struct net_device *dev = neigh->dev;
- __be32 target = *(__be32*)neigh->primary_key;
+ __be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev;
- if (!in_dev)
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
return;
-
+ }
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
- if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL)
+ if (skb && inet_addr_type(dev_net(dev),
+ ip_hdr(skb)->saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)->saddr;
break;
case 1: /* Restrict announcements of saddr in same subnet */
@@ -358,22 +369,26 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
case 2: /* Avoid secondary IPs, get a primary/preferred one */
break;
}
+ rcu_read_unlock();
- if (in_dev)
- in_dev_put(in_dev);
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
- if ((probes -= neigh->parms->ucast_probes) < 0) {
- if (!(neigh->nud_state&NUD_VALID))
- printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+ probes -= neigh->parms->ucast_probes;
+ if (probes < 0) {
+ if (!(neigh->nud_state & NUD_VALID))
+ printk(KERN_DEBUG
+ "trying to ucast probe in NUD_INVALID\n");
dst_ha = neigh->ha;
read_lock_bh(&neigh->lock);
- } else if ((probes -= neigh->parms->app_probes) < 0) {
+ } else {
+ probes -= neigh->parms->app_probes;
+ if (probes < 0) {
#ifdef CONFIG_ARPD
- neigh_app_ns(neigh);
+ neigh_app_ns(neigh);
#endif
- return;
+ return;
+ }
}
arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -427,7 +442,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
if (ip_route_output_key(net, &rt, &fl) < 0)
return 1;
- if (rt->u.dst.dev != dev) {
+ if (rt->dst.dev != dev) {
NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
flag = 1;
}
@@ -446,7 +461,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
* is allowed to use this function, it is scheduled to be removed. --ANK
*/
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
+ __be32 paddr, struct net_device *dev)
{
switch (addr_hint) {
case RTN_LOCAL:
@@ -478,17 +494,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
paddr = skb_rtable(skb)->rt_gateway;
- if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev))
+ if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+ paddr, dev))
return 0;
n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
if (n) {
n->used = jiffies;
- if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
- read_lock_bh(&n->lock);
- memcpy(haddr, n->ha, dev->addr_len);
- read_unlock_bh(&n->lock);
+ if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+ neigh_ha_snapshot(haddr, n, dev);
neigh_release(n);
return 0;
}
@@ -497,6 +512,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
kfree_skb(skb);
return 1;
}
+EXPORT_SYMBOL(arp_find);
/* END OF OBSOLETE FUNCTIONS */
@@ -509,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
return -EINVAL;
if (n == NULL) {
__be32 nexthop = ((struct rtable *)dst)->rt_gateway;
- if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
nexthop = 0;
n = __neigh_lookup_errno(
#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
- dev->type == ARPHRD_ATM ? clip_tbl_hook :
+ dev->type == ARPHRD_ATM ?
+ clip_tbl_hook :
#endif
- &arp_tbl, &nexthop, dev);
+ &arp_tbl, &nexthop, dev);
if (IS_ERR(n))
return PTR_ERR(n);
dst->neighbour = n;
@@ -532,24 +549,24 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
struct in_device *out_dev;
int imi, omi = -1;
- if (rt->u.dst.dev == dev)
+ if (rt->dst.dev == dev)
return 0;
if (!IN_DEV_PROXY_ARP(in_dev))
return 0;
-
- if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+ imi = IN_DEV_MEDIUM_ID(in_dev);
+ if (imi == 0)
return 1;
if (imi == -1)
return 0;
/* place to check for proxy_arp for routes */
- if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+ out_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (out_dev)
omi = IN_DEV_MEDIUM_ID(out_dev);
- in_dev_put(out_dev);
- }
- return (omi != imi && omi != -1);
+
+ return omi != imi && omi != -1;
}
/*
@@ -576,7 +593,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev,
__be32 sip, __be32 tip)
{
/* Private VLAN is only concerned about the same ethernet segment */
- if (rt->u.dst.dev != dev)
+ if (rt->dst.dev != dev)
return 0;
/* Don't reply on self probes (often done by windowz boxes)*/
@@ -679,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
arp->ar_pln = 4;
arp->ar_op = htons(type);
- arp_ptr=(unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
memcpy(arp_ptr, src_hw, dev->addr_len);
arp_ptr += dev->addr_len;
@@ -698,6 +715,7 @@ out:
kfree_skb(skb);
return NULL;
}
+EXPORT_SYMBOL(arp_create);
/*
* Send an arp packet.
@@ -707,6 +725,7 @@ void arp_xmit(struct sk_buff *skb)
/* Send it off, maybe filter it using firewalling first. */
NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
}
+EXPORT_SYMBOL(arp_xmit);
/*
* Create and send an arp packet.
@@ -727,12 +746,12 @@ void arp_send(int type, int ptype, __be32 dest_ip,
skb = arp_create(type, ptype, dest_ip, dev, src_ip,
dest_hw, src_hw, target_hw);
- if (skb == NULL) {
+ if (skb == NULL)
return;
- }
arp_xmit(skb);
}
+EXPORT_SYMBOL(arp_send);
/*
* Process an arp request.
@@ -741,7 +760,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
static int arp_process(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct arphdr *arp;
unsigned char *arp_ptr;
struct rtable *rt;
@@ -806,7 +825,7 @@ static int arp_process(struct sk_buff *skb)
/*
* Extract fields
*/
- arp_ptr= (unsigned char *)(arp+1);
+ arp_ptr = (unsigned char *)(arp + 1);
sha = arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
@@ -860,16 +879,17 @@ static int arp_process(struct sk_buff *skb)
addr_type = rt->rt_type;
if (addr_type == RTN_LOCAL) {
- int dont_send = 0;
+ int dont_send;
- if (!dont_send)
- dont_send |= arp_ignore(in_dev,sip,tip);
+ dont_send = arp_ignore(in_dev, sip, tip);
if (!dont_send && IN_DEV_ARPFILTER(in_dev))
- dont_send |= arp_filter(sip,tip,dev);
+ dont_send |= arp_filter(sip, tip, dev);
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
neigh_release(n);
}
}
@@ -878,8 +898,7 @@ static int arp_process(struct sk_buff *skb)
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
- pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
- {
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
@@ -887,10 +906,12 @@ static int arp_process(struct sk_buff *skb)
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
in_dev->arp_parms->proxy_delay == 0) {
- arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
} else {
- pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
- in_dev_put(in_dev);
+ pneigh_enqueue(&arp_tbl,
+ in_dev->arp_parms, skb);
return 0;
}
goto out;
@@ -931,13 +952,12 @@ static int arp_process(struct sk_buff *skb)
if (arp->ar_op != htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
- neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_update(n, sha, state,
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0);
neigh_release(n);
}
out:
- if (in_dev)
- in_dev_put(in_dev);
consume_skb(skb);
return 0;
}
@@ -969,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
arp->ar_pln != 4)
goto freeskb;
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (skb == NULL)
goto out_of_mem;
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1013,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
return -EINVAL;
if (!dev && (r->arp_flags & ATF_COM)) {
dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
- r->arp_ha.sa_data);
+ r->arp_ha.sa_data);
if (!dev)
return -ENODEV;
}
@@ -1027,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
}
static int arp_req_set(struct net *net, struct arpreq *r,
- struct net_device * dev)
+ struct net_device *dev)
{
__be32 ip;
struct neighbour *neigh;
@@ -1040,12 +1061,13 @@ static int arp_req_set(struct net *net, struct arpreq *r,
if (r->arp_flags & ATF_PERM)
r->arp_flags |= ATF_COM;
if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+ struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } };
+ struct rtable *rt;
+ err = ip_route_output_key(net, &rt, &fl);
+ if (err != 0)
return err;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
return -EINVAL;
@@ -1077,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
unsigned state = NUD_STALE;
if (r->arp_flags & ATF_PERM)
state = NUD_PERMANENT;
- err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+ err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
r->arp_ha.sa_data : NULL, state,
- NEIGH_UPDATE_F_OVERRIDE|
+ NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN);
neigh_release(neigh);
}
@@ -1088,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
static unsigned arp_state_to_flags(struct neighbour *neigh)
{
- unsigned flags = 0;
if (neigh->nud_state&NUD_PERMANENT)
- flags = ATF_PERM|ATF_COM;
+ return ATF_PERM | ATF_COM;
else if (neigh->nud_state&NUD_VALID)
- flags = ATF_COM;
- return flags;
+ return ATF_COM;
+ else
+ return 0;
}
/*
@@ -1136,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
}
static int arp_req_delete(struct net *net, struct arpreq *r,
- struct net_device * dev)
+ struct net_device *dev)
{
int err;
__be32 ip;
@@ -1147,12 +1169,13 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
if (dev == NULL) {
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
- .tos = RTO_ONLINK } } };
- struct rtable * rt;
- if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+ struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+ .tos = RTO_ONLINK } };
+ struct rtable *rt;
+ err = ip_route_output_key(net, &rt, &fl);
+ if (err != 0)
return err;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
return -EINVAL;
@@ -1160,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
err = -ENXIO;
neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
- if (neigh->nud_state&~NUD_NOARP)
+ if (neigh->nud_state & ~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN);
@@ -1180,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
struct net_device *dev = NULL;
switch (cmd) {
- case SIOCDARP:
- case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- case SIOCGARP:
- err = copy_from_user(&r, arg, sizeof(struct arpreq));
- if (err)
- return -EFAULT;
- break;
- default:
- return -EINVAL;
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP:
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
}
if (r.arp_pa.sa_family != AF_INET)
return -EPFNOSUPPORT;
if (!(r.arp_flags & ATF_PUBL) &&
- (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+ (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
return -EINVAL;
if (!(r.arp_flags & ATF_NETMASK))
((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1205,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL)
+ dev = __dev_get_by_name(net, r.arp_dev);
+ if (dev == NULL)
goto out;
/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1237,7 +1261,8 @@ out:
return err;
}
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
{
struct net_device *dev = ptr;
@@ -1305,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
for (n = 0, s = buf; n < 6; n++) {
c = (a->ax25_call[n] >> 1) & 0x7F;
- if (c != ' ') *s++ = c;
+ if (c != ' ')
+ *s++ = c;
}
*s++ = '-';
-
- if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+ n = (a->ax25_call[6] >> 1) & 0x0F;
+ if (n > 9) {
*s++ = '1';
n -= 10;
}
@@ -1319,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
*s++ = '\0';
if (*buf == '\0' || *buf == '-')
- return "*";
+ return "*";
return buf;
-
}
#endif /* CONFIG_AX25 */
@@ -1402,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
/* ------------------------------------------------------------------------ */
static const struct seq_operations arp_seq_ops = {
- .start = arp_seq_start,
- .next = neigh_seq_next,
- .stop = neigh_seq_stop,
- .show = arp_seq_show,
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
};
static int arp_seq_open(struct inode *inode, struct file *file)
@@ -1453,14 +1478,3 @@ static int __init arp_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-
-EXPORT_SYMBOL(arp_broken_ops);
-EXPORT_SYMBOL(arp_find);
-EXPORT_SYMBOL(arp_create);
-EXPORT_SYMBOL(arp_xmit);
-EXPORT_SYMBOL(arp_send);
-EXPORT_SYMBOL(arp_tbl);
-
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 3a92a76ae41d..094e150c6260 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -9,7 +9,7 @@
*
* The CIPSO draft specification can be found in the kernel's Documentation
* directory as well as the following URL:
- * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
* The FIPS-188 specification can be found at the following URL:
* http://www.itl.nist.gov/fipspubs/fip188.htm
*
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index fb2465811b48..174be6caa5c8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,16 +62,17 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
if (!inet->inet_saddr)
inet->inet_saddr = rt->rt_src; /* Update source address */
- if (!inet->inet_rcv_saddr)
+ if (!inet->inet_rcv_saddr) {
inet->inet_rcv_saddr = rt->rt_src;
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ }
inet->inet_daddr = rt->rt_dst;
inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
inet->inet_id = jiffies;
- sk_dst_set(sk, &rt->u.dst);
- return(0);
+ sk_dst_set(sk, &rt->dst);
+ return 0;
}
-
EXPORT_SYMBOL(ip4_datagram_connect);
-
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 382bc768ed56..dc94b0316b78 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
inet_free_ifa(ifa);
}
- dev->ip_ptr = NULL;
+ rcu_assign_pointer(dev->ip_ptr, NULL);
devinet_sysctl_unregister(in_dev);
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
return inet_insert_ifa(ifa);
}
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
@@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
- in_dev = in_dev_get(dev);
+ in_dev = rcu_dereference_rtnl(dev->ip_ptr);
rcu_read_unlock();
return in_dev;
}
@@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
goto errout;
}
- __in_dev_put(in_dev);
-
for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
@@ -1059,7 +1060,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
switch (event) {
case NETDEV_REGISTER:
printk(KERN_DEBUG "inetdev_event: bug\n");
- dev->ip_ptr = NULL;
+ rcu_assign_pointer(dev->ip_ptr, NULL);
break;
case NETDEV_UP:
if (!inetdev_valid_mtu(dev->mtu))
@@ -1081,6 +1082,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
ip_mc_up(in_dev);
/* fall through */
+ case NETDEV_NOTIFY_PEERS:
case NETDEV_CHANGEADDR:
/* Send gratuitous ARP to notify of link change */
if (IN_DEV_ARP_NOTIFY(in_dev)) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4f0ed458c883..eb6f69a8f27a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,34 +147,43 @@ static void fib_flush(struct net *net)
rt_cache_flush(net, -1);
}
-/*
- * Find the first device with a given source address.
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
*/
-
-struct net_device * ip_dev_find(struct net *net, __be32 addr)
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
{
- struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
- struct fib_result res;
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = addr
+ }
+ },
+ .flags = FLOWI_FLAG_MATCH_ANY_IIF
+ };
+ struct fib_result res = { 0 };
struct net_device *dev = NULL;
- struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!local_table || fib_table_lookup(local_table, &fl, &res))
+ rcu_read_lock();
+ if (fib_lookup(net, &fl, &res)) {
+ rcu_read_unlock();
return NULL;
+ }
if (res.type != RTN_LOCAL)
goto out;
dev = FIB_RES_DEV(res);
- if (dev)
+ if (dev && devref)
dev_hold(dev);
out:
- fib_res_put(&res);
+ rcu_read_unlock();
return dev;
}
+EXPORT_SYMBOL(__ip_dev_find);
/*
* Find address type as if only "dev" was present in the system. If
@@ -201,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
- if (!fib_table_lookup(local_table, &fl, &res)) {
+ rcu_read_lock();
+ if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
- fib_res_put(&res);
}
+ rcu_read_unlock();
}
return ret;
}
@@ -214,40 +224,46 @@ unsigned int inet_addr_type(struct net *net, __be32 addr)
{
return __inet_dev_addr_type(net, NULL, addr);
}
+EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
- return __inet_dev_addr_type(net, dev, addr);
+ return __inet_dev_addr_type(net, dev, addr);
}
+EXPORT_SYMBOL(inet_dev_addr_type);
/* Given (packet source, input interface) and optional (dst, oif, tos):
- - (main) check, that source is valid i.e. not broadcast or our local
- address.
- - figure out what "logical" interface this packet arrived
- and calculate "specific destination" address.
- - check, that packet arrived from expected physical interface.
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
*/
-
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
struct net_device *dev, __be32 *spec_dst,
u32 *itag, u32 mark)
{
struct in_device *in_dev;
- struct flowi fl = { .nl_u = { .ip4_u =
- { .daddr = src,
- .saddr = dst,
- .tos = tos } },
- .mark = mark,
- .iif = oif };
-
+ struct flowi fl = {
+ .nl_u = {
+ .ip4_u = {
+ .daddr = src,
+ .saddr = dst,
+ .tos = tos
+ }
+ },
+ .mark = mark,
+ .iif = oif
+ };
struct fib_result res;
int no_addr, rpf, accept_local;
+ bool dev_match;
int ret;
struct net *net;
no_addr = rpf = accept_local = 0;
- rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
no_addr = in_dev->ifa_list == NULL;
@@ -256,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
if (mark && !IN_DEV_SRC_VMARK(in_dev))
fl.mark = 0;
}
- rcu_read_unlock();
if (in_dev == NULL)
goto e_inval;
@@ -266,25 +281,33 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
goto last_resort;
if (res.type != RTN_UNICAST) {
if (res.type != RTN_LOCAL || !accept_local)
- goto e_inval_res;
+ goto e_inval;
}
*spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
+ dev_match = false;
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
#else
if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
#endif
- {
+ if (dev_match) {
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
- fib_res_put(&res);
return ret;
}
- fib_res_put(&res);
if (no_addr)
goto last_resort;
if (rpf == 1)
- goto e_inval;
+ goto e_rpf;
fl.oif = dev->ifindex;
ret = 0;
@@ -293,21 +316,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
*spec_dst = FIB_RES_PREFSRC(res);
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
}
- fib_res_put(&res);
}
return ret;
last_resort:
if (rpf)
- goto e_inval;
+ goto e_rpf;
*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
*itag = 0;
return 0;
-e_inval_res:
- fib_res_put(&res);
e_inval:
return -EINVAL;
+e_rpf:
+ return -EXDEV;
}
static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -456,9 +478,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
}
/*
- * Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
*/
-
int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct fib_config cfg;
@@ -502,7 +524,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
return -EINVAL;
}
-const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -516,7 +538,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
};
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
- struct nlmsghdr *nlh, struct fib_config *cfg)
+ struct nlmsghdr *nlh, struct fib_config *cfg)
{
struct nlattr *attr;
int err, remaining;
@@ -671,12 +693,11 @@ out:
}
/* Prepare and feed intra-kernel routing request.
- Really, it should be netlink message, but :-( netlink
- can be not configured, so that we feed it directly
- to fib engine. It is legal, because all events occur
- only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
*/
-
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -722,9 +743,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
struct in_ifaddr *prim = ifa;
__be32 mask = ifa->ifa_mask;
__be32 addr = ifa->ifa_local;
- __be32 prefix = ifa->ifa_address&mask;
+ __be32 prefix = ifa->ifa_address & mask;
- if (ifa->ifa_flags&IFA_F_SECONDARY) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
if (prim == NULL) {
printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -734,22 +755,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
- fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+ 32, prim);
}
}
}
@@ -760,17 +783,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
struct net_device *dev = in_dev->dev;
struct in_ifaddr *ifa1;
struct in_ifaddr *prim = ifa;
- __be32 brd = ifa->ifa_address|~ifa->ifa_mask;
- __be32 any = ifa->ifa_address&ifa->ifa_mask;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
#define LOCAL_OK 1
#define BRD_OK 2
#define BRD0_OK 4
#define BRD1_OK 8
unsigned ok = 0;
- if (!(ifa->ifa_flags&IFA_F_SECONDARY))
- fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
- RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
else {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (prim == NULL) {
@@ -780,9 +804,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
}
/* Deletion is more complicated than add.
- We should take care of not to delete too much :-)
-
- Scan address list to be sure that addresses are really gone.
+ * We should take care of not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
*/
for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -796,23 +820,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
ok |= BRD0_OK;
}
- if (!(ok&BRD_OK))
+ if (!(ok & BRD_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!(ok&BRD1_OK))
+ if (!(ok & BRD1_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
- if (!(ok&BRD0_OK))
+ if (!(ok & BRD0_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
- if (!(ok&LOCAL_OK)) {
+ if (!(ok & LOCAL_OK)) {
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
/* Check, that this local address finally disappeared. */
if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
/* And the last, but not the least thing.
- We must flush stray FIB entries.
-
- First of all, we scan fib_info list searching
- for stray nexthop entries, then ignite fib_flush.
- */
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
fib_flush(dev_net(dev));
}
@@ -823,14 +847,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
{
struct fib_result res;
- struct flowi fl = { .mark = frn->fl_mark,
- .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
- .tos = frn->fl_tos,
- .scope = frn->fl_scope } } };
+ struct flowi fl = {
+ .mark = frn->fl_mark,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = frn->fl_addr,
+ .tos = frn->fl_tos,
+ .scope = frn->fl_scope
+ }
+ }
+ };
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
@@ -841,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
local_bh_disable();
frn->tb_id = tb->tb_id;
- frn->err = fib_table_lookup(tb, &fl, &res);
+ rcu_read_lock();
+ frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
frn->nh_sel = res.nh_sel;
frn->type = res.type;
frn->scope = res.scope;
- fib_res_put(&res);
}
+ rcu_read_unlock();
local_bh_enable();
}
}
@@ -878,8 +909,8 @@ static void nl_fib_input(struct sk_buff *skb)
nl_fib_lookup(frn, tb);
- pid = NETLINK_CB(skb).pid; /* pid of sending process */
- NETLINK_CB(skb).pid = 0; /* from kernel */
+ pid = NETLINK_CB(skb).pid; /* pid of sending process */
+ NETLINK_CB(skb).pid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
}
@@ -926,7 +957,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
fib_del_ifaddr(ifa);
if (ifa->ifa_dev->ifa_list == NULL) {
/* Last address was deleted from this interface.
- Disable IP.
+ * Disable IP.
*/
fib_disable_ip(dev, 1, 0);
} else {
@@ -985,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = {
static int __net_init ip_fib_net_init(struct net *net)
{
int err;
- unsigned int i;
+ size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
- net->ipv4.fib_table_hash = kzalloc(
- sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
+ /* Avoid false sharing : Use at least a full cache line */
+ size = max_t(size_t, size, L1_CACHE_BYTES);
+
+ net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
if (net->ipv4.fib_table_hash == NULL)
return -ENOMEM;
- for (i = 0; i < FIB_TABLE_HASHSZ; i++)
- INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
-
err = fib4_rules_init(net);
if (err < 0)
goto fail;
@@ -1022,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net)
hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
hlist_del(node);
fib_table_flush(tb);
- kfree(tb);
+ fib_free_table(tb);
}
}
kfree(net->ipv4.fib_table_hash);
@@ -1075,7 +1105,3 @@ void __init ip_fib_init(void)
fib_hash_init();
}
-
-EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(inet_dev_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1bc..b3acb0417b21 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,36 +54,37 @@ struct fib_node {
struct fib_alias fn_embedded_alias;
};
-struct fn_zone {
- struct fn_zone *fz_next; /* Next not empty zone */
- struct hlist_head *fz_hash; /* Hash table pointer */
- int fz_nent; /* Number of entries */
+#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
- int fz_divisor; /* Hash divisor */
+struct fn_zone {
+ struct fn_zone __rcu *fz_next; /* Next not empty zone */
+ struct hlist_head __rcu *fz_hash; /* Hash table pointer */
+ seqlock_t fz_lock;
u32 fz_hashmask; /* (fz_divisor - 1) */
-#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
- int fz_order; /* Zone order */
- __be32 fz_mask;
+ u8 fz_order; /* Zone order (0..32) */
+ u8 fz_revorder; /* 32 - fz_order */
+ __be32 fz_mask; /* inet_make_mask(order) */
#define FZ_MASK(fz) ((fz)->fz_mask)
-};
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
+ struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
+
+ int fz_nent; /* Number of entries */
+ int fz_divisor; /* Hash size (mask+1) */
+};
struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone *fn_zone_list;
+ struct fn_zone *fn_zones[33];
+ struct fn_zone __rcu *fn_zone_list;
};
static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
{
- u32 h = ntohl(key)>>(32 - fz->fz_order);
+ u32 h = ntohl(key) >> fz->fz_revorder;
h ^= (h>>20);
h ^= (h>>10);
h ^= (h>>5);
- h &= FZ_HASHMASK(fz);
+ h &= fz->fz_hashmask;
return h;
}
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
return dst & FZ_MASK(fz);
}
-static DEFINE_RWLOCK(fib_hash_lock);
static unsigned int fib_hash_genid;
#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
{
unsigned long size = divisor * sizeof(struct hlist_head);
- if (size <= PAGE_SIZE) {
+ if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
- } else {
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
- }
+
+ return (struct hlist_head *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
}
/* The fib hash lock must be held when this is called. */
@@ -123,10 +122,11 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
struct hlist_head *new_head;
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
- new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
- hlist_add_head(&f->fn_hash, new_head);
+ new_head = rcu_dereference_protected(fz->fz_hash, 1) +
+ fn_hash(f->fn_key, fz);
+ hlist_add_head_rcu(&f->fn_hash, new_head);
}
}
}
@@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
int old_divisor, new_divisor;
u32 new_hashmask;
- old_divisor = fz->fz_divisor;
+ new_divisor = old_divisor = fz->fz_divisor;
switch (old_divisor) {
- case 16:
- new_divisor = 256;
+ case EMBEDDED_HASH_SIZE:
+ new_divisor *= EMBEDDED_HASH_SIZE;
break;
- case 256:
- new_divisor = 1024;
+ case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
+ new_divisor *= (EMBEDDED_HASH_SIZE/2);
break;
default:
if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -175,31 +175,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
ht = fz_hash_alloc(new_divisor);
if (ht) {
- write_lock_bh(&fib_hash_lock);
- old_ht = fz->fz_hash;
- fz->fz_hash = ht;
+ struct fn_zone nfz;
+
+ memcpy(&nfz, fz, sizeof(nfz));
+
+ write_seqlock_bh(&fz->fz_lock);
+ old_ht = rcu_dereference_protected(fz->fz_hash, 1);
+ RCU_INIT_POINTER(nfz.fz_hash, ht);
+ nfz.fz_hashmask = new_hashmask;
+ nfz.fz_divisor = new_divisor;
+ fn_rebuild_zone(&nfz, old_ht, old_divisor);
+ fib_hash_genid++;
+ rcu_assign_pointer(fz->fz_hash, ht);
fz->fz_hashmask = new_hashmask;
fz->fz_divisor = new_divisor;
- fn_rebuild_zone(fz, old_ht, old_divisor);
- fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
+ write_sequnlock_bh(&fz->fz_lock);
- fz_hash_free(old_ht, old_divisor);
+ if (old_ht != fz->fz_embedded_hash) {
+ synchronize_rcu();
+ fz_hash_free(old_ht, old_divisor);
+ }
}
}
-static inline void fn_free_node(struct fib_node * f)
+static void fn_free_node_rcu(struct rcu_head *head)
{
+ struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
+
kmem_cache_free(fn_hash_kmem, f);
}
+static inline void fn_free_node(struct fib_node *f)
+{
+ call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
+}
+
+static void fn_free_alias_rcu(struct rcu_head *head)
+{
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+
+ kmem_cache_free(fn_alias_kmem, fa);
+}
+
static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
{
fib_release_info(fa->fa_info);
if (fa == &f->fn_embedded_alias)
fa->fa_info = NULL;
else
- kmem_cache_free(fn_alias_kmem, fa);
+ call_rcu(&fa->rcu, fn_free_alias_rcu);
}
static struct fn_zone *
@@ -210,68 +234,71 @@ fn_new_zone(struct fn_hash *table, int z)
if (!fz)
return NULL;
- if (z) {
- fz->fz_divisor = 16;
- } else {
- fz->fz_divisor = 1;
- }
- fz->fz_hashmask = (fz->fz_divisor - 1);
- fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
- if (!fz->fz_hash) {
- kfree(fz);
- return NULL;
- }
+ seqlock_init(&fz->fz_lock);
+ fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
+ fz->fz_hashmask = fz->fz_divisor - 1;
+ RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
fz->fz_order = z;
+ fz->fz_revorder = 32 - z;
fz->fz_mask = inet_make_mask(z);
/* Find the first not empty zone with more specific mask */
- for (i=z+1; i<=32; i++)
+ for (i = z + 1; i <= 32; i++)
if (table->fn_zones[i])
break;
- write_lock_bh(&fib_hash_lock);
- if (i>32) {
+ if (i > 32) {
/* No more specific masks, we are the first. */
- fz->fz_next = table->fn_zone_list;
- table->fn_zone_list = fz;
+ rcu_assign_pointer(fz->fz_next,
+ rtnl_dereference(table->fn_zone_list));
+ rcu_assign_pointer(table->fn_zone_list, fz);
} else {
- fz->fz_next = table->fn_zones[i]->fz_next;
- table->fn_zones[i]->fz_next = fz;
+ rcu_assign_pointer(fz->fz_next,
+ rtnl_dereference(table->fn_zones[i]->fz_next));
+ rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
}
table->fn_zones[z] = fz;
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
return fz;
}
int fib_table_lookup(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res)
+ const struct flowi *flp, struct fib_result *res,
+ int fib_flags)
{
int err;
struct fn_zone *fz;
struct fn_hash *t = (struct fn_hash *)tb->tb_data;
- read_lock(&fib_hash_lock);
- for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+ rcu_read_lock();
+ for (fz = rcu_dereference(t->fn_zone_list);
+ fz != NULL;
+ fz = rcu_dereference(fz->fz_next)) {
struct hlist_head *head;
struct hlist_node *node;
struct fib_node *f;
- __be32 k = fz_key(flp->fl4_dst, fz);
+ __be32 k;
+ unsigned int seq;
- head = &fz->fz_hash[fn_hash(k, fz)];
- hlist_for_each_entry(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
+ do {
+ seq = read_seqbegin(&fz->fz_lock);
+ k = fz_key(flp->fl4_dst, fz);
+
+ head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
+ if (f->fn_key != k)
+ continue;
- err = fib_semantic_match(&f->fn_alias,
+ err = fib_semantic_match(&f->fn_alias,
flp, res,
- fz->fz_order);
- if (err <= 0)
- goto out;
- }
+ fz->fz_order, fib_flags);
+ if (err <= 0)
+ goto out;
+ }
+ } while (read_seqretry(&fz->fz_lock, seq));
}
err = 1;
out:
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
return err;
}
@@ -285,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb,
struct fib_info *last_resort;
struct fn_hash *t = (struct fn_hash *)tb->tb_data;
struct fn_zone *fz = t->fn_zones[0];
+ struct hlist_head *head;
if (fz == NULL)
return;
@@ -293,11 +321,12 @@ void fib_table_select_default(struct fib_table *tb,
last_resort = NULL;
order = -1;
- read_lock(&fib_hash_lock);
- hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+ rcu_read_lock();
+ head = rcu_dereference(fz->fz_hash);
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
struct fib_alias *fa;
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
struct fib_info *next_fi = fa->fa_info;
if (fa->fa_scope != res->scope ||
@@ -309,7 +338,8 @@ void fib_table_select_default(struct fib_table *tb,
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+
+ fib_alias_accessed(fa);
if (fi == NULL) {
if (next_fi != res->fi)
@@ -341,25 +371,25 @@ void fib_table_select_default(struct fib_table *tb,
fib_result_assign(res, last_resort);
tb->tb_default = last_idx;
out:
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
}
/* Insert node F to FZ. */
static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
{
- struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+ struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
- hlist_add_head(&f->fn_hash, head);
+ hlist_add_head_rcu(&f->fn_hash, head);
}
/* Return the node in FZ matching KEY. */
static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
{
- struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
+ struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
struct hlist_node *node;
struct fib_node *f;
- hlist_for_each_entry(f, node, head, fn_hash) {
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
if (f->fn_key == key)
return f;
}
@@ -367,6 +397,17 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
return NULL;
}
+
+static struct fib_alias *fib_fast_alloc(struct fib_node *f)
+{
+ struct fib_alias *fa = &f->fn_embedded_alias;
+
+ if (fa->fa_info != NULL)
+ fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+ return fa;
+}
+
+/* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
@@ -451,7 +492,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
}
if (cfg->fc_nlflags & NLM_F_REPLACE) {
- struct fib_info *fi_drop;
u8 state;
fa = fa_first;
@@ -460,21 +500,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
err = 0;
goto out;
}
- write_lock_bh(&fib_hash_lock);
- fi_drop = fa->fa_info;
- fa->fa_info = fi;
- fa->fa_type = cfg->fc_type;
- fa->fa_scope = cfg->fc_scope;
+ err = -ENOBUFS;
+ new_fa = fib_fast_alloc(f);
+ if (new_fa == NULL)
+ goto out;
+
+ new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_info = fi;
+ new_fa->fa_type = cfg->fc_type;
+ new_fa->fa_scope = cfg->fc_scope;
state = fa->fa_state;
- fa->fa_state &= ~FA_S_ACCESSED;
+ new_fa->fa_state = state & ~FA_S_ACCESSED;
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
+ list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
- fib_release_info(fi_drop);
+ fn_free_alias(fa, f);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, NLM_F_REPLACE);
+ rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
+ tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
return 0;
}
@@ -506,12 +550,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
f = new_f;
}
- new_fa = &f->fn_embedded_alias;
- if (new_fa->fa_info != NULL) {
- new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- if (new_fa == NULL)
- goto out;
- }
+ new_fa = fib_fast_alloc(f);
+ if (new_fa == NULL)
+ goto out;
+
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
@@ -522,13 +564,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
* Insert new entry to the list.
*/
- write_lock_bh(&fib_hash_lock);
if (new_f)
fib_insert_node(fz, new_f);
- list_add_tail(&new_fa->fa_list,
+ list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : &f->fn_alias));
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
if (new_f)
fz->fz_nent++;
@@ -603,14 +643,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
tb->tb_id, &cfg->fc_nlinfo, 0);
kill_fn = 0;
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
kill_fn = 1;
}
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
if (fa->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -627,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
static int fn_flush_list(struct fn_zone *fz, int idx)
{
- struct hlist_head *head = &fz->fz_hash[idx];
+ struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
struct hlist_node *node, *n;
struct fib_node *f;
int found = 0;
@@ -641,14 +679,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
struct fib_info *fi = fa->fa_info;
if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- write_lock_bh(&fib_hash_lock);
- list_del(&fa->fa_list);
+ list_del_rcu(&fa->fa_list);
if (list_empty(&f->fn_alias)) {
- hlist_del(&f->fn_hash);
+ hlist_del_rcu(&f->fn_hash);
kill_f = 1;
}
fib_hash_genid++;
- write_unlock_bh(&fib_hash_lock);
fn_free_alias(fa, f);
found++;
@@ -662,13 +698,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
return found;
}
+/* caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
struct fn_zone *fz;
int found = 0;
- for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+ for (fz = rtnl_dereference(table->fn_zone_list);
+ fz != NULL;
+ fz = rtnl_dereference(fz->fz_next)) {
int i;
for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -677,6 +716,24 @@ int fib_table_flush(struct fib_table *tb)
return found;
}
+void fib_free_table(struct fib_table *tb)
+{
+ struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+ struct fn_zone *fz, *next;
+
+ next = table->fn_zone_list;
+ while (next != NULL) {
+ fz = next;
+ next = fz->fz_next;
+
+ if (fz->fz_hash != fz->fz_embedded_hash)
+ fz_hash_free(fz->fz_hash, fz->fz_divisor);
+
+ kfree(fz);
+ }
+
+ kfree(tb);
+}
static inline int
fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
@@ -690,10 +747,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
s_i = cb->args[4];
i = 0;
- hlist_for_each_entry(f, node, head, fn_hash) {
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
struct fib_alias *fa;
- list_for_each_entry(fa, &f->fn_alias, fa_list) {
+ list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
if (i < s_i)
goto next;
@@ -711,7 +768,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
cb->args[4] = i;
return -1;
}
- next:
+next:
i++;
}
}
@@ -725,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
struct fn_zone *fz)
{
int h, s_h;
+ struct hlist_head *head = rcu_dereference(fz->fz_hash);
- if (fz->fz_hash == NULL)
+ if (head == NULL)
return skb->len;
s_h = cb->args[3];
for (h = s_h; h < fz->fz_divisor; h++) {
- if (hlist_empty(&fz->fz_hash[h]))
+ if (hlist_empty(head + h))
continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) {
+ if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
cb->args[3] = h;
return -1;
}
@@ -746,23 +804,26 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- int m, s_m;
+ int m = 0, s_m;
struct fn_zone *fz;
struct fn_hash *table = (struct fn_hash *)tb->tb_data;
s_m = cb->args[2];
- read_lock(&fib_hash_lock);
- for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
- if (m < s_m) continue;
+ rcu_read_lock();
+ for (fz = rcu_dereference(table->fn_zone_list);
+ fz != NULL;
+ fz = rcu_dereference(fz->fz_next), m++) {
+ if (m < s_m)
+ continue;
if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
cb->args[2] = m;
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
return -1;
}
memset(&cb->args[3], 0,
sizeof(cb->args) - 3*sizeof(cb->args[0]));
}
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
cb->args[2] = m;
return skb->len;
}
@@ -825,14 +886,15 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
iter->genid = fib_hash_genid;
iter->valid = 1;
- for (iter->zone = table->fn_zone_list; iter->zone;
- iter->zone = iter->zone->fz_next) {
+ for (iter->zone = rcu_dereference(table->fn_zone_list);
+ iter->zone != NULL;
+ iter->zone = rcu_dereference(iter->zone->fz_next)) {
int maxslot;
if (!iter->zone->fz_nent)
continue;
- iter->hash_head = iter->zone->fz_hash;
+ iter->hash_head = rcu_dereference(iter->zone->fz_hash);
maxslot = iter->zone->fz_divisor;
for (iter->bucket = 0; iter->bucket < maxslot;
@@ -911,13 +973,13 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
}
}
- iter->zone = iter->zone->fz_next;
+ iter->zone = rcu_dereference(iter->zone->fz_next);
if (!iter->zone)
goto out;
iter->bucket = 0;
- iter->hash_head = iter->zone->fz_hash;
+ iter->hash_head = rcu_dereference(iter->zone->fz_hash);
hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
list_for_each_entry(fa, &fn->fn_alias, fa_list) {
@@ -950,11 +1012,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
}
static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(fib_hash_lock)
+ __acquires(RCU)
{
void *v = NULL;
- read_lock(&fib_hash_lock);
+ rcu_read_lock();
if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
return v;
@@ -967,15 +1029,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void fib_seq_stop(struct seq_file *seq, void *v)
- __releases(fib_hash_lock)
+ __releases(RCU)
{
- read_unlock(&fib_hash_lock);
+ rcu_read_unlock();
}
static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
+ [7] = RTF_REJECT,
+ [8] = RTF_REJECT,
};
unsigned flags = type2flags[type];
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..a29edf2219c8 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,17 +12,22 @@ struct fib_alias {
u8 fa_type;
u8 fa_scope;
u8 fa_state;
-#ifdef CONFIG_IP_FIB_TRIE
struct rcu_head rcu;
-#endif
};
#define FA_S_ACCESSED 0x01
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+ if (!(fa->fa_state & FA_S_ACCESSED))
+ fa->fa_state |= FA_S_ACCESSED;
+}
+
/* Exported by fib_semantics.c */
extern int fib_semantic_match(struct list_head *head,
const struct flowi *flp,
- struct fib_result *res, int prefixlen);
+ struct fib_result *res, int prefixlen, int fib_flags);
extern void fib_release_info(struct fib_info *);
extern struct fib_info *fib_create_info(struct fib_config *cfg);
extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff564..7981a24f5c7b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
* IPv4 Forwarding Information Base: policy rules.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * Thomas Graf <tgraf@suug.ch>
+ * Thomas Graf <tgraf@suug.ch>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
* 2 of the License, or (at your option) any later version.
*
* Fixes:
- * Rani Assaf : local_rule cannot be deleted
+ * Rani Assaf : local_rule cannot be deleted
* Marc Boucher : routing by fwmark
*/
@@ -32,8 +32,7 @@
#include <net/ip_fib.h>
#include <net/fib_rules.h>
-struct fib4_rule
-{
+struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
@@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
+ .flags = FIB_LOOKUP_NOREF,
};
int err;
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
goto errout;
}
- if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
+ tbl = fib_get_table(rule->fr_net, rule->table);
+ if (!tbl)
goto errout;
- err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
+ err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
if (err > 0)
err = -EAGAIN;
errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e8..3e0da3ef6116 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
static DEFINE_SPINLOCK(fib_multipath_lock);
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
-for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh; \
+ for (nhsel = 0, nh = (fi)->fib_nh; \
+ nhsel < (fi)->fib_nhs; \
+ nh++, nhsel++)
+
+#define change_nexthops(fi) { \
+ int nhsel; struct fib_nh *nexthop_nh; \
+ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ nhsel < (fi)->fib_nhs; \
+ nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
/* Hope, that gcc will optimize it to get rid of dummy loop */
-#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) { \
+ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
+ for (nhsel = 0; nhsel < 1; nhsel++)
-#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) { \
+ int nhsel; \
+ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
+ for (nhsel = 0; nhsel < 1; nhsel++)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -86,63 +95,70 @@ static const struct
int error;
u8 scope;
} fib_props[RTN_MAX + 1] = {
- {
+ [RTN_UNSPEC] = {
.error = 0,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_UNSPEC */
- {
+ },
+ [RTN_UNICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNICAST */
- {
+ },
+ [RTN_LOCAL] = {
.error = 0,
.scope = RT_SCOPE_HOST,
- }, /* RTN_LOCAL */
- {
+ },
+ [RTN_BROADCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_BROADCAST */
- {
+ },
+ [RTN_ANYCAST] = {
.error = 0,
.scope = RT_SCOPE_LINK,
- }, /* RTN_ANYCAST */
- {
+ },
+ [RTN_MULTICAST] = {
.error = 0,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_MULTICAST */
- {
+ },
+ [RTN_BLACKHOLE] = {
.error = -EINVAL,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_BLACKHOLE */
- {
+ },
+ [RTN_UNREACHABLE] = {
.error = -EHOSTUNREACH,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_UNREACHABLE */
- {
+ },
+ [RTN_PROHIBIT] = {
.error = -EACCES,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_PROHIBIT */
- {
+ },
+ [RTN_THROW] = {
.error = -EAGAIN,
.scope = RT_SCOPE_UNIVERSE,
- }, /* RTN_THROW */
- {
+ },
+ [RTN_NAT] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_NAT */
- {
+ },
+ [RTN_XRESOLVE] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
- }, /* RTN_XRESOLVE */
+ },
};
/* Release a nexthop info record */
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+ struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+ kfree(fi);
+}
+
void free_fib_info(struct fib_info *fi)
{
if (fi->fib_dead == 0) {
- printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
+ pr_warning("Freeing alive fib_info %p\n", fi);
return;
}
change_nexthops(fi) {
@@ -152,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
} endfor_nexthops(fi);
fib_info_cnt--;
release_net(fi->fib_net);
- kfree(fi);
+ call_rcu(&fi->rcu, free_fib_info_rcu);
}
void fib_release_info(struct fib_info *fi)
@@ -173,7 +189,7 @@ void fib_release_info(struct fib_info *fi)
spin_unlock_bh(&fib_info_lock);
}
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
const struct fib_nh *onh = ofi->fib_nh;
@@ -187,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
#ifdef CONFIG_NET_CLS_ROUTE
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+ ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
return -1;
onh++;
} endfor_nexthops(fi);
@@ -238,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
nfi->fib_priority == fi->fib_priority &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(fi->fib_metrics)) == 0 &&
- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+ ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
}
@@ -247,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
}
/* Check, that the gateway is already configured.
- Used only by redirect accept routine.
+ * Used only by redirect accept routine.
*/
-
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
@@ -264,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
hlist_for_each_entry(nh, node, head, nh_hash) {
if (nh->nh_dev == dev &&
nh->nh_gw == gw &&
- !(nh->nh_flags&RTNH_F_DEAD)) {
+ !(nh->nh_flags & RTNH_F_DEAD)) {
spin_unlock(&fib_info_lock);
return 0;
}
@@ -362,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order,
}
if (state == NUD_REACHABLE)
return 0;
- if ((state&NUD_VALID) && order != dflt)
+ if ((state & NUD_VALID) && order != dflt)
return 0;
- if ((state&NUD_VALID) ||
- (*last_idx<0 && order > dflt)) {
+ if ((state & NUD_VALID) ||
+ (*last_idx < 0 && order > dflt)) {
*last_resort = fi;
*last_idx = order;
}
@@ -476,75 +491,76 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
/*
- Picture
- -------
-
- Semantics of nexthop is very messy by historical reasons.
- We have to take into account, that:
- a) gateway can be actually local interface address,
- so that gatewayed route is direct.
- b) gateway must be on-link address, possibly
- described not by an ifaddr, but also by a direct route.
- c) If both gateway and interface are specified, they should not
- contradict.
- d) If we use tunnel routes, gateway could be not on-link.
-
- Attempt to reconcile all of these (alas, self-contradictory) conditions
- results in pretty ugly and hairy code with obscure logic.
-
- I chose to generalized it instead, so that the size
- of code does not increase practically, but it becomes
- much more general.
- Every prefix is assigned a "scope" value: "host" is local address,
- "link" is direct route,
- [ ... "site" ... "interior" ... ]
- and "universe" is true gateway route with global meaning.
-
- Every prefix refers to a set of "nexthop"s (gw, oif),
- where gw must have narrower scope. This recursion stops
- when gw has LOCAL scope or if "nexthop" is declared ONLINK,
- which means that gw is forced to be on link.
-
- Code is still hairy, but now it is apparently logically
- consistent and very flexible. F.e. as by-product it allows
- to co-exists in peace independent exterior and interior
- routing processes.
-
- Normally it looks as following.
-
- {universe prefix} -> (gw, oif) [scope link]
- |
- |-> {link prefix} -> (gw, oif) [scope local]
- |
- |-> {local prefix} (terminal node)
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ * so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ * described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ * contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix} -> (gw, oif) [scope link]
+ * |
+ * |-> {link prefix} -> (gw, oif) [scope local]
+ * |
+ * |-> {local prefix} (terminal node)
*/
-
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh)
{
int err;
struct net *net;
+ struct net_device *dev;
net = cfg->fc_nlinfo.nl_net;
if (nh->nh_gw) {
struct fib_result res;
- if (nh->nh_flags&RTNH_F_ONLINK) {
- struct net_device *dev;
+ if (nh->nh_flags & RTNH_F_ONLINK) {
if (cfg->fc_scope >= RT_SCOPE_LINK)
return -EINVAL;
if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
return -EINVAL;
- if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
+ dev = __dev_get_by_index(net, nh->nh_oif);
+ if (!dev)
return -ENODEV;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return -ENETDOWN;
nh->nh_dev = dev;
dev_hold(dev);
nh->nh_scope = RT_SCOPE_LINK;
return 0;
}
+ rcu_read_lock();
{
struct flowi fl = {
.nl_u = {
@@ -559,50 +575,53 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
/* It is not necessary, but requires a bit of thinking */
if (fl.fl4_scope < RT_SCOPE_LINK)
fl.fl4_scope = RT_SCOPE_LINK;
- if ((err = fib_lookup(net, &fl, &res)) != 0)
+ err = fib_lookup(net, &fl, &res);
+ if (err) {
+ rcu_read_unlock();
return err;
+ }
}
err = -EINVAL;
if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
goto out;
nh->nh_scope = res.scope;
nh->nh_oif = FIB_RES_OIF(res);
- if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+ nh->nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev)
goto out;
- dev_hold(nh->nh_dev);
- err = -ENETDOWN;
- if (!(nh->nh_dev->flags & IFF_UP))
- goto out;
- err = 0;
-out:
- fib_res_put(&res);
- return err;
+ dev_hold(dev);
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
} else {
struct in_device *in_dev;
- if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+ if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
return -EINVAL;
+ rcu_read_lock();
+ err = -ENODEV;
in_dev = inetdev_by_index(net, nh->nh_oif);
if (in_dev == NULL)
- return -ENODEV;
- if (!(in_dev->dev->flags&IFF_UP)) {
- in_dev_put(in_dev);
- return -ENETDOWN;
- }
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP))
+ goto out;
nh->nh_dev = in_dev->dev;
dev_hold(nh->nh_dev);
nh->nh_scope = RT_SCOPE_HOST;
- in_dev_put(in_dev);
+ err = 0;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return err;
}
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
unsigned int mask = (fib_hash_size - 1);
- return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+ return ((__force u32)val ^
+ ((__force u32)val >> 7) ^
+ ((__force u32)val >> 14)) & mask;
}
static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +630,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
return kzalloc(bytes, GFP_KERNEL);
else
return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(bytes));
}
static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +826,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} else {
change_nexthops(fi) {
- if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
+ err = fib_check_nh(cfg, fi, nexthop_nh);
+ if (err != 0)
goto failure;
} endfor_nexthops(fi)
}
@@ -819,7 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
}
link_it:
- if ((ofi = fib_find_info(fi)) != NULL) {
+ ofi = fib_find_info(fi);
+ if (ofi) {
fi->fib_dead = 1;
free_fib_info(fi);
ofi->fib_treeref++;
@@ -864,7 +886,7 @@ failure:
/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, int prefixlen)
+ struct fib_result *res, int prefixlen, int fib_flags)
{
struct fib_alias *fa;
int nh_sel = 0;
@@ -879,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
if (fa->fa_scope < flp->fl4_scope)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+ fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
if (err == 0) {
@@ -895,7 +917,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
case RTN_ANYCAST:
case RTN_MULTICAST:
for_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
+ if (nh->nh_flags & RTNH_F_DEAD)
continue;
if (!flp->oif || flp->oif == nh->nh_oif)
break;
@@ -906,16 +928,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
goto out_fill_res;
}
#else
- if (nhsel < 1) {
+ if (nhsel < 1)
goto out_fill_res;
- }
#endif
endfor_nexthops(fi);
continue;
default:
- printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
- fa->fa_type);
+ pr_warning("fib_semantic_match bad type %#x\n",
+ fa->fa_type);
return -EINVAL;
}
}
@@ -929,7 +950,8 @@ out_fill_res:
res->type = fa->fa_type;
res->scope = fa->fa_scope;
res->fi = fa->fa_info;
- atomic_inc(&res->fi->fib_clntref);
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&res->fi->fib_clntref);
return 0;
}
@@ -1028,10 +1050,10 @@ nla_put_failure:
}
/*
- Update FIB if:
- - local address disappeared -> we must delete all the entries
- referring to it.
- - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ * referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
*/
int fib_sync_down_addr(struct net *net, __be32 local)
{
@@ -1078,7 +1100,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
prev_fi = fi;
dead = 0;
change_nexthops(fi) {
- if (nexthop_nh->nh_flags&RTNH_F_DEAD)
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD)
dead++;
else if (nexthop_nh->nh_dev == dev &&
nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1132,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
- Dead device goes up. We wake up dead nexthops.
- It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
*/
-
int fib_sync_up(struct net_device *dev)
{
struct fib_info *prev_fi;
@@ -1123,7 +1144,7 @@ int fib_sync_up(struct net_device *dev)
struct fib_nh *nh;
int ret;
- if (!(dev->flags&IFF_UP))
+ if (!(dev->flags & IFF_UP))
return 0;
prev_fi = NULL;
@@ -1142,12 +1163,12 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
alive++;
continue;
}
if (nexthop_nh->nh_dev == NULL ||
- !(nexthop_nh->nh_dev->flags&IFF_UP))
+ !(nexthop_nh->nh_dev->flags & IFF_UP))
continue;
if (nexthop_nh->nh_dev != dev ||
!__in_dev_get_rtnl(dev))
@@ -1169,10 +1190,9 @@ int fib_sync_up(struct net_device *dev)
}
/*
- The algorithm is suboptimal, but it provides really
- fair weighted route distribution.
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
*/
-
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
struct fib_info *fi = res->fi;
@@ -1182,7 +1202,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
if (fi->fib_power <= 0) {
int power = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
power += nexthop_nh->nh_weight;
nexthop_nh->nh_power = nexthop_nh->nh_weight;
}
@@ -1198,15 +1218,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
/* w should be random number [0..fi->fib_power-1],
- it is pretty bad approximation.
+ * it is pretty bad approximation.
*/
w = jiffies % fi->fib_power;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
+ if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
nexthop_nh->nh_power) {
- if ((w -= nexthop_nh->nh_power) <= 0) {
+ w -= nexthop_nh->nh_power;
+ if (w <= 0) {
nexthop_nh->nh_power--;
fi->fib_power--;
res->nh_sel = nhsel;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 79d057a939ba..200eb538fbb3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -16,7 +16,7 @@
*
* An experimental study of compression methods for dynamic tries
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+ * http://www.csc.kth.se/~snilsson/software/dyntrie2/
*
*
* IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
@@ -186,7 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
{
struct tnode *ret = node_parent(node);
- return rcu_dereference(ret);
+ return rcu_dereference_rtnl(ret);
}
/* Same as rcu_assign_pointer
@@ -209,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
struct node *ret = tnode_get_child(tn, i);
- return rcu_dereference_check(ret,
- rcu_read_lock_held() ||
- lockdep_rtnl_is_held());
+ return rcu_dereference_rtnl(ret);
}
static inline int tnode_child_length(const struct tnode *tn)
@@ -457,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
tn->empty_children = 1<<bits;
}
- pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
- (unsigned long) (sizeof(struct node) << bits));
+ pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
+ sizeof(struct node) << bits);
return tn;
}
@@ -607,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Keep root node larger */
- if (!node_parent((struct node*) tn)) {
+ if (!node_parent((struct node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
- }
- else {
+ } else {
inflate_threshold_use = inflate_threshold;
halve_threshold_use = halve_threshold;
}
@@ -637,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
check_tnode(tn);
/* Return if at least one inflate is run */
- if( max_work != MAX_WORK)
+ if (max_work != MAX_WORK)
return (struct node *) tn;
/*
@@ -964,9 +961,7 @@ fib_find_node(struct trie *t, u32 key)
struct node *n;
pos = 0;
- n = rcu_dereference_check(t->trie,
- rcu_read_lock_held() ||
- lockdep_rtnl_is_held());
+ n = rcu_dereference_rtnl(t->trie);
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
@@ -1347,7 +1342,7 @@ err:
/* should be called with rcu_read_lock */
static int check_leaf(struct trie *t, struct leaf *l,
t_key key, const struct flowi *flp,
- struct fib_result *res)
+ struct fib_result *res, int fib_flags)
{
struct leaf_info *li;
struct hlist_head *hhead = &l->list;
@@ -1361,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
if (l->key != (key & ntohl(mask)))
continue;
- err = fib_semantic_match(&li->falh, flp, res, plen);
+ err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
#ifdef CONFIG_IP_FIB_TRIE_STATS
if (err <= 0)
@@ -1377,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
}
int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
- struct fib_result *res)
+ struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
@@ -1389,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
t_key cindex = 0;
int current_prefix_length = KEYLENGTH;
struct tnode *cn;
- t_key node_prefix, key_prefix, pref_mismatch;
- int mp;
+ t_key pref_mismatch;
rcu_read_lock();
@@ -1404,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+ ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
goto found;
}
@@ -1429,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+ ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
@@ -1505,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
* matching prefix.
*/
- node_prefix = mask_pfx(cn->key, cn->pos);
- key_prefix = mask_pfx(key, cn->pos);
- pref_mismatch = key_prefix^node_prefix;
- mp = 0;
+ pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
/*
* In short: If skipped bits in this node do not match
@@ -1516,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
* state.directly.
*/
if (pref_mismatch) {
- while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
- mp++;
- pref_mismatch = pref_mismatch << 1;
- }
- key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+ int mp = KEYLENGTH - fls(pref_mismatch);
- if (key_prefix != 0)
+ if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
goto backtrace;
if (current_prefix_length >= cn->pos)
@@ -1746,14 +1733,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
/* Node empty, walk back up to parent */
c = (struct node *) p;
- } while ( (p = node_parent_rcu(c)) != NULL);
+ } while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
}
static struct leaf *trie_firstleaf(struct trie *t)
{
- struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+ struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
if (!n)
return NULL;
@@ -1810,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb)
return found;
}
+void fib_free_table(struct fib_table *tb)
+{
+ kfree(tb);
+}
+
void fib_table_select_default(struct fib_table *tb,
const struct flowi *flp,
struct fib_result *res)
@@ -1851,7 +1843,8 @@ void fib_table_select_default(struct fib_table *tb,
if (!next_fi->fib_nh[0].nh_gw ||
next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
continue;
- fa->fa_state |= FA_S_ACCESSED;
+
+ fib_alias_accessed(fa);
if (fi == NULL) {
if (next_fi != res->fi)
@@ -2039,14 +2032,14 @@ struct fib_trie_iter {
struct seq_net_private p;
struct fib_table *tb;
struct tnode *tnode;
- unsigned index;
- unsigned depth;
+ unsigned int index;
+ unsigned int depth;
};
static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
- unsigned cindex = iter->index;
+ unsigned int cindex = iter->index;
struct tnode *p;
/* A single entry routing table */
@@ -2155,7 +2148,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
*/
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
- unsigned i, max, pointers, bytes, avdepth;
+ unsigned int i, max, pointers, bytes, avdepth;
if (stat->leaves)
avdepth = stat->totdepth*100 / stat->leaves;
@@ -2352,7 +2345,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
static void seq_indent(struct seq_file *seq, int n)
{
- while (n-- > 0) seq_puts(seq, " ");
+ while (n-- > 0)
+ seq_puts(seq, " ");
}
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2384,7 +2378,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
[RTN_XRESOLVE] = "XRESOLVE",
};
-static inline const char *rtn_type(char *buf, size_t len, unsigned t)
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
if (t < __RTN_MAX && rtn_type_names[t])
return rtn_type_names[t];
@@ -2540,13 +2534,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock();
}
-static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
{
- static unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT, [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
+ unsigned int flags = 0;
+ if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+ flags = RTF_REJECT;
if (fi && fi->fib_nh->nh_gw)
flags |= RTF_GATEWAY;
if (mask == htonl(0xFFFFFFFF))
@@ -2558,7 +2551,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
/*
* This outputs /proc/net/route.
* The format of the file is not supposed to be changed
- * and needs to be same as fib_hash output to avoid breaking
+ * and needs to be same as fib_hash output to avoid breaking
* legacy utilities
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2583,7 +2576,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
const struct fib_info *fi = fa->fa_info;
- unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
int len;
if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..c6933f2ea310
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,152 @@
+/*
+ * GRE over IPv4 demultiplexer driver
+ *
+ * Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static DEFINE_SPINLOCK(gre_proto_lock);
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ goto err_out;
+
+ spin_lock(&gre_proto_lock);
+ if (gre_proto[version])
+ goto err_out_unlock;
+
+ rcu_assign_pointer(gre_proto[version], proto);
+ spin_unlock(&gre_proto_lock);
+ return 0;
+
+err_out_unlock:
+ spin_unlock(&gre_proto_lock);
+err_out:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+ if (version >= GREPROTO_MAX)
+ goto err_out;
+
+ spin_lock(&gre_proto_lock);
+ if (rcu_dereference_protected(gre_proto[version],
+ lockdep_is_held(&gre_proto_lock)) != proto)
+ goto err_out_unlock;
+ rcu_assign_pointer(gre_proto[version], NULL);
+ spin_unlock(&gre_proto_lock);
+ synchronize_rcu();
+ return 0;
+
+err_out_unlock:
+ spin_unlock(&gre_proto_lock);
+err_out:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+static int gre_rcv(struct sk_buff *skb)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+ int ret;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->handler)
+ goto drop_unlock;
+ ret = proto->handler(skb);
+ rcu_read_unlock();
+ return ret;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+ const struct gre_protocol *proto;
+ u8 ver;
+
+ if (!pskb_may_pull(skb, 12))
+ goto drop;
+
+ ver = skb->data[1]&0x7f;
+ if (ver >= GREPROTO_MAX)
+ goto drop;
+
+ rcu_read_lock();
+ proto = rcu_dereference(gre_proto[ver]);
+ if (!proto || !proto->err_handler)
+ goto drop_unlock;
+ proto->err_handler(skb, info);
+ rcu_read_unlock();
+ return;
+
+drop_unlock:
+ rcu_read_unlock();
+drop:
+ kfree_skb(skb);
+}
+
+static const struct net_protocol net_gre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .netns_ok = 1,
+};
+
+static int __init gre_init(void)
+{
+ pr_info("GRE over IPv4 demultiplexor driver");
+
+ if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+ pr_err("gre: can't add protocol\n");
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void __exit gre_exit(void)
+{
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
+
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d65e9215bcd7..96bc7f9475a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -181,6 +181,7 @@ const struct icmp_err icmp_err_convert[] = {
.fatal = 1,
},
};
+EXPORT_SYMBOL(icmp_err_convert);
/*
* ICMP control array. This specifies what to do with each ICMP.
@@ -267,11 +268,12 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
dst->rate_tokens = token;
return rc;
}
+EXPORT_SYMBOL(xrlim_allow);
static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
int type, int code)
{
- struct dst_entry *dst = &rt->u.dst;
+ struct dst_entry *dst = &rt->dst;
int rc = 1;
if (type > NR_ICMP_TYPES)
@@ -327,7 +329,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
struct sock *sk;
struct sk_buff *skb;
- sk = icmp_sk(dev_net((*rt)->u.dst.dev));
+ sk = icmp_sk(dev_net((*rt)->dst.dev));
if (ip_append_data(sk, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
@@ -359,7 +361,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->u.dst.dev);
+ struct net *net = dev_net(rt->dst.dev);
struct sock *sk;
struct inet_sock *inet;
__be32 daddr;
@@ -377,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
inet->tos = ip_hdr(skb)->tos;
daddr = ipc.addr = rt->rt_src;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
if (icmp_param->replyopts.optlen) {
ipc.opt = &icmp_param->replyopts;
if (ipc.opt->srr)
@@ -427,7 +429,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
if (!rt)
goto out;
- net = dev_net(rt->u.dst.dev);
+ net = dev_net(rt->dst.dev);
/*
* Find the original header. It is expected to be valid, of course.
@@ -536,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
inet_sk(sk)->tos = tos;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
{
struct flowi fl = {
@@ -596,9 +598,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
/* Ugh! */
orefdst = skb_in->_skb_refdst; /* save old refdst */
err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
- RT_TOS(tos), rt2->u.dst.dev);
+ RT_TOS(tos), rt2->dst.dev);
- dst_release(&rt2->u.dst);
+ dst_release(&rt2->dst);
rt2 = skb_rtable(skb_in);
skb_in->_skb_refdst = orefdst; /* restore old refdst */
}
@@ -610,7 +612,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
XFRM_LOOKUP_ICMP);
switch (err) {
case 0:
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
rt = rt2;
break;
case -EPERM:
@@ -629,7 +631,7 @@ route_done:
/* RFC says return as much as we can without exceeding 576 bytes. */
- room = dst_mtu(&rt->u.dst);
+ room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
@@ -647,6 +649,7 @@ out_unlock:
icmp_xmit_unlock(sk);
out:;
}
+EXPORT_SYMBOL(icmp_send);
/*
@@ -925,6 +928,7 @@ static void icmp_address(struct sk_buff *skb)
/*
* RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
* loudly if an inconsistency is found.
+ * called with rcu_read_lock()
*/
static void icmp_address_reply(struct sk_buff *skb)
@@ -935,12 +939,12 @@ static void icmp_address_reply(struct sk_buff *skb)
struct in_ifaddr *ifa;
if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
- goto out;
+ return;
- in_dev = in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
- goto out;
- rcu_read_lock();
+ return;
+
if (in_dev->ifa_list &&
IN_DEV_LOG_MARTIANS(in_dev) &&
IN_DEV_FORWARD(in_dev)) {
@@ -958,9 +962,6 @@ static void icmp_address_reply(struct sk_buff *skb)
mp, dev->name, &rt->rt_src);
}
}
- rcu_read_unlock();
- in_dev_put(in_dev);
-out:;
}
static void icmp_discard(struct sk_buff *skb)
@@ -974,7 +975,7 @@ int icmp_rcv(struct sk_buff *skb)
{
struct icmphdr *icmph;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->u.dst.dev);
+ struct net *net = dev_net(rt->dst.dev);
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
@@ -1216,7 +1217,3 @@ int __init icmp_init(void)
{
return register_pernet_subsys(&icmp_sk_ops);
}
-
-EXPORT_SYMBOL(icmp_err_convert);
-EXPORT_SYMBOL(icmp_send);
-EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5fff865a4fa7..c8877c6c7216 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -312,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
return NULL;
}
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
skb->dev = dev;
skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -330,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->saddr = rt->rt_src;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(pip, &rt->u.dst, NULL);
+ ip_select_ident(pip, &rt->dst, NULL);
((u8*)&pip[1])[0] = IPOPT_RA;
((u8*)&pip[1])[1] = 4;
((u8*)&pip[1])[2] = 0;
@@ -660,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
return -1;
}
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -676,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->daddr = dst;
iph->saddr = rt->rt_src;
iph->protocol = IPPROTO_IGMP;
- ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_select_ident(iph, &rt->dst, NULL);
((u8*)&iph[1])[0] = IPOPT_RA;
((u8*)&iph[1])[1] = 4;
((u8*)&iph[1])[2] = 0;
@@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
igmpv3_clear_delrec(in_dev);
} else if (len < 12) {
return; /* ignore bogus packet; freed by caller */
+ } else if (IGMP_V1_SEEN(in_dev)) {
+ /* This is a v3 query with v1 queriers present */
+ max_delay = IGMP_Query_Response_Interval;
+ group = 0;
+ } else if (IGMP_V2_SEEN(in_dev)) {
+ /* this is a v3 query with v2 queriers present;
+ * Interpretation of the max_delay code is problematic here.
+ * A real v2 host would use ih_code directly, while v3 has a
+ * different encoding. We use the v3 encoding as more likely
+ * to be intended in a v3 query.
+ */
+ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
} else { /* v3 */
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
return;
@@ -916,18 +928,19 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
read_unlock(&in_dev->mc_list_lock);
}
+/* called in rcu_read_lock() section */
int igmp_rcv(struct sk_buff *skb)
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih;
- struct in_device *in_dev = in_dev_get(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
int len = skb->len;
if (in_dev == NULL)
goto drop;
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
- goto drop_ref;
+ goto drop;
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
@@ -937,7 +950,7 @@ int igmp_rcv(struct sk_buff *skb)
case CHECKSUM_NONE:
skb->csum = 0;
if (__skb_checksum_complete(skb))
- goto drop_ref;
+ goto drop;
}
ih = igmp_hdr(skb);
@@ -957,7 +970,6 @@ int igmp_rcv(struct sk_buff *skb)
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
- in_dev_put(in_dev);
return pim_rcv_v1(skb);
#endif
case IGMPV3_HOST_MEMBERSHIP_REPORT:
@@ -971,8 +983,6 @@ int igmp_rcv(struct sk_buff *skb)
break;
}
-drop_ref:
- in_dev_put(in_dev);
drop:
kfree_skb(skb);
return 0;
@@ -1246,6 +1256,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
out:
return;
}
+EXPORT_SYMBOL(ip_mc_inc_group);
/*
* Resend IGMP JOIN report; used for bonding.
@@ -1258,16 +1269,17 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
- igmp_mod_timer(im, IGMP_Initial_Report_Delay);
- return;
- }
- /* else, v3 */
- im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
- IGMP_Unsolicited_Report_Count;
- igmp_ifc_event(in_dev);
+ /* a failover is happening and switches
+ * must be notified immediately */
+ if (IGMP_V1_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+ else if (IGMP_V2_SEEN(in_dev))
+ igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+ else
+ igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
#endif
}
+EXPORT_SYMBOL(ip_mc_rejoin_group);
/*
* A socket has left a multicast group on device dev
@@ -1298,6 +1310,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
}
}
}
+EXPORT_SYMBOL(ip_mc_dec_group);
/* Device changing type */
@@ -1405,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
write_unlock_bh(&in_dev->mc_list_lock);
}
+/* RTNL is locked */
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
struct flowi fl = { .nl_u = { .ip4_u =
@@ -1415,19 +1429,16 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
if (imr->imr_ifindex) {
idev = inetdev_by_index(net, imr->imr_ifindex);
- if (idev)
- __in_dev_put(idev);
return idev;
}
if (imr->imr_address.s_addr) {
- dev = ip_dev_find(net, imr->imr_address.s_addr);
+ dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
if (!dev)
return NULL;
- dev_put(dev);
}
if (!dev && !ip_route_output_key(net, &rt, &fl)) {
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
}
if (dev) {
@@ -1646,8 +1657,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
if (!dpsf) {
- dpsf = (struct ip_sf_list *)
- kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
if (!dpsf)
continue;
*dpsf = *psf;
@@ -1807,6 +1817,7 @@ done:
rtnl_unlock();
return err;
}
+EXPORT_SYMBOL(ip_mc_join_group);
static void ip_sf_socklist_reclaim(struct rcu_head *rp)
{
@@ -2679,8 +2690,3 @@ int __init igmp_mc_proc_init(void)
return register_pernet_subsys(&igmp_net_ops);
}
#endif
-
-EXPORT_SYMBOL(ip_mc_dec_group);
-EXPORT_SYMBOL(ip_mc_inc_group);
-EXPORT_SYMBOL(ip_mc_join_group);
-EXPORT_SYMBOL(ip_mc_rejoin_group);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 70eb3507c406..7174370b1195 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -84,7 +84,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
}
return node != NULL;
}
-
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
/* Obtain a reference to a local port for the given sock,
@@ -212,7 +211,6 @@ fail:
local_bh_enable();
return ret;
}
-
EXPORT_SYMBOL_GPL(inet_csk_get_port);
/*
@@ -305,7 +303,6 @@ out_err:
*err = error;
goto out;
}
-
EXPORT_SYMBOL(inet_csk_accept);
/*
@@ -327,7 +324,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
-
EXPORT_SYMBOL(inet_csk_init_xmit_timers);
void inet_csk_clear_xmit_timers(struct sock *sk)
@@ -340,21 +336,18 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
sk_stop_timer(sk, &icsk->icsk_delack_timer);
sk_stop_timer(sk, &sk->sk_timer);
}
-
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
sk_stop_timer(sk, &sk->sk_timer);
}
-
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
-
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(struct sock *sk,
@@ -383,7 +376,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
goto no_route;
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto route_err;
- return &rt->u.dst;
+ return &rt->dst;
route_err:
ip_rt_put(rt);
@@ -391,7 +384,6 @@ no_route:
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
-
EXPORT_SYMBOL_GPL(inet_csk_route_req);
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
@@ -433,7 +425,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
return req;
}
-
EXPORT_SYMBOL_GPL(inet_csk_search_req);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
@@ -447,11 +438,11 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
inet_csk_reqsk_queue_added(sk, timeout);
}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
@@ -569,7 +560,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
if (lopt->qlen)
inet_csk_reset_keepalive_timer(parent, interval);
}
-
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
@@ -599,7 +589,6 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
}
return newsk;
}
-
EXPORT_SYMBOL_GPL(inet_csk_clone);
/*
@@ -630,7 +619,6 @@ void inet_csk_destroy_sock(struct sock *sk)
percpu_counter_dec(sk->sk_prot->orphan_count);
sock_put(sk);
}
-
EXPORT_SYMBOL(inet_csk_destroy_sock);
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
@@ -665,7 +653,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
__reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
-
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
/*
@@ -720,7 +707,6 @@ void inet_csk_listen_stop(struct sock *sk)
}
WARN_ON(sk->sk_ack_backlog);
}
-
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
@@ -732,7 +718,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
sin->sin_addr.s_addr = inet->inet_daddr;
sin->sin_port = inet->inet_dport;
}
-
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
#ifdef CONFIG_COMPAT
@@ -747,7 +732,6 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
return icsk->icsk_af_ops->getsockopt(sk, level, optname,
optval, optlen);
}
-
EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
@@ -761,6 +745,5 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
return icsk->icsk_af_ops->setsockopt(sk, level, optname,
optval, optlen);
}
-
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce320..ba8042665849 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
bc += op->no;
}
}
- return (len == 0);
+ return len == 0;
}
static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a2ca6aed763b..5ff2a51b6d0c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -114,7 +114,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
fq->last_in |= INET_FRAG_COMPLETE;
}
}
-
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index d3e160a88219..1b344f30b463 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -99,24 +99,46 @@ void inet_put_port(struct sock *sk)
__inet_put_port(sk);
local_bh_enable();
}
-
EXPORT_SYMBOL(inet_put_port);
-void __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(struct sock *sk, struct sock *child)
{
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
- const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
+ unsigned short port = inet_sk(child)->inet_num;
+ const int bhash = inet_bhashfn(sock_net(sk), port,
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind_bucket *tb;
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
+ if (tb->port != port) {
+ /* NOTE: using tproxy and redirecting skbs to a proxy
+ * on a different listener port breaks the assumption
+ * that the listener socket's icsk_bind_hash is the same
+ * as that of the child socket. We have to look up or
+ * create a new bind bucket for the child here. */
+ struct hlist_node *node;
+ inet_bind_bucket_for_each(tb, node, &head->chain) {
+ if (net_eq(ib_net(tb), sock_net(sk)) &&
+ tb->port == port)
+ break;
+ }
+ if (!node) {
+ tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+ sock_net(sk), head, port);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ return -ENOMEM;
+ }
+ }
+ }
sk_add_bind_node(child, &tb->owners);
inet_csk(child)->icsk_bind_hash = tb;
spin_unlock(&head->lock);
-}
+ return 0;
+}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
static inline int compute_score(struct sock *sk, struct net *net,
@@ -546,7 +568,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
__inet_check_established, __inet_hash_nolisten);
}
-
EXPORT_SYMBOL_GPL(inet_hash_connect);
void inet_hashinfo_init(struct inet_hashinfo *h)
@@ -560,5 +581,4 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
i + LISTENING_NULLS_BASE);
}
}
-
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 6bcfe52a9c87..9e94d7cf4f8a 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -51,8 +51,8 @@
* lookups performed with disabled BHs.
*
* Serialisation issues.
- * 1. Nodes may appear in the tree only with the pool write lock held.
- * 2. Nodes may disappear from the tree only with the pool write lock held
+ * 1. Nodes may appear in the tree only with the pool lock held.
+ * 2. Nodes may disappear from the tree only with the pool lock held
* AND reference count being 0.
* 3. Nodes appears and disappears from unused node list only under
* "inet_peer_unused_lock".
@@ -64,23 +64,32 @@
* usually under some other lock to prevent node disappearing
* dtime: unused node list lock
* v4daddr: unchangeable
- * ip_id_count: idlock
+ * ip_id_count: atomic value (no lock needed)
*/
static struct kmem_cache *peer_cachep __read_mostly;
#define node_height(x) x->avl_height
-static struct inet_peer peer_fake_node = {
- .avl_left = &peer_fake_node,
- .avl_right = &peer_fake_node,
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
+#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+ .avl_left = peer_avl_empty_rcu,
+ .avl_right = peer_avl_empty_rcu,
.avl_height = 0
};
-#define peer_avl_empty (&peer_fake_node)
-static struct inet_peer *peer_root = peer_avl_empty;
-static DEFINE_RWLOCK(peer_pool_lock);
+
+static struct {
+ struct inet_peer __rcu *root;
+ spinlock_t lock;
+ int total;
+} peers = {
+ .root = peer_avl_empty_rcu,
+ .lock = __SPIN_LOCK_UNLOCKED(peers.lock),
+ .total = 0,
+};
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
-static int peer_total;
/* Exported for sysctl_net_ipv4. */
int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
* aggressively at this stage */
@@ -89,8 +98,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
int inet_peer_gc_mintime __read_mostly = 10 * HZ;
int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-static LIST_HEAD(unused_peers);
-static DEFINE_SPINLOCK(inet_peer_unused_lock);
+static struct {
+ struct list_head list;
+ spinlock_t lock;
+} unused_peers = {
+ .list = LIST_HEAD_INIT(unused_peers.list),
+ .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock),
+};
static void peer_check_expire(unsigned long dummy);
static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
@@ -116,7 +130,7 @@ void __init inet_initpeers(void)
peer_cachep = kmem_cache_create("inet_peer_cache",
sizeof(struct inet_peer),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
/* All the timers, started at system startup tend
@@ -131,122 +145,174 @@ void __init inet_initpeers(void)
/* Called with or without local BH being disabled. */
static void unlink_from_unused(struct inet_peer *p)
{
- spin_lock_bh(&inet_peer_unused_lock);
- list_del_init(&p->unused);
- spin_unlock_bh(&inet_peer_unused_lock);
+ if (!list_empty(&p->unused)) {
+ spin_lock_bh(&unused_peers.lock);
+ list_del_init(&p->unused);
+ spin_unlock_bh(&unused_peers.lock);
+ }
}
/*
* Called with local BH disabled and the pool lock held.
- * _stack is known to be NULL or not at compile time,
- * so compiler will optimize the if (_stack) tests.
*/
#define lookup(_daddr, _stack) \
({ \
- struct inet_peer *u, **v; \
- if (_stack != NULL) { \
- stackptr = _stack; \
- *stackptr++ = &peer_root; \
- } \
- for (u = peer_root; u != peer_avl_empty; ) { \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
+ \
+ stackptr = _stack; \
+ *stackptr++ = &peers.root; \
+ for (u = rcu_dereference_protected(peers.root, \
+ lockdep_is_held(&peers.lock)); \
+ u != peer_avl_empty; ) { \
if (_daddr == u->v4daddr) \
break; \
if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
v = &u->avl_left; \
else \
v = &u->avl_right; \
- if (_stack != NULL) \
- *stackptr++ = v; \
- u = *v; \
+ *stackptr++ = v; \
+ u = rcu_dereference_protected(*v, \
+ lockdep_is_held(&peers.lock)); \
} \
u; \
})
-/* Called with local BH disabled and the pool write lock held. */
+/*
+ * Called with rcu_read_lock_bh()
+ * Because we hold no lock against a writer, its quite possible we fall
+ * in an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if number of links exceeds PEER_MAXDEPTH
+ */
+static struct inet_peer *lookup_rcu_bh(__be32 daddr)
+{
+ struct inet_peer *u = rcu_dereference_bh(peers.root);
+ int count = 0;
+
+ while (u != peer_avl_empty) {
+ if (daddr == u->v4daddr) {
+ /* Before taking a reference, check if this entry was
+ * deleted, unlink_from_pool() sets refcnt=-1 to make
+ * distinction between an unused entry (refcnt=0) and
+ * a freed one.
+ */
+ if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
+ u = NULL;
+ return u;
+ }
+ if ((__force __u32)daddr < (__force __u32)u->v4daddr)
+ u = rcu_dereference_bh(u->avl_left);
+ else
+ u = rcu_dereference_bh(u->avl_right);
+ if (unlikely(++count == PEER_MAXDEPTH))
+ break;
+ }
+ return NULL;
+}
+
+/* Called with local BH disabled and the pool lock held. */
#define lookup_rightempty(start) \
({ \
- struct inet_peer *u, **v; \
+ struct inet_peer *u; \
+ struct inet_peer __rcu **v; \
*stackptr++ = &start->avl_left; \
v = &start->avl_left; \
- for (u = *v; u->avl_right != peer_avl_empty; ) { \
+ for (u = rcu_dereference_protected(*v, \
+ lockdep_is_held(&peers.lock)); \
+ u->avl_right != peer_avl_empty_rcu; ) { \
v = &u->avl_right; \
*stackptr++ = v; \
- u = *v; \
+ u = rcu_dereference_protected(*v, \
+ lockdep_is_held(&peers.lock)); \
} \
u; \
})
-/* Called with local BH disabled and the pool write lock held.
+/* Called with local BH disabled and the pool lock held.
* Variable names are the proof of operation correctness.
- * Look into mm/map_avl.c for more detail description of the ideas. */
-static void peer_avl_rebalance(struct inet_peer **stack[],
- struct inet_peer ***stackend)
+ * Look into mm/map_avl.c for more detail description of the ideas.
+ */
+static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
+ struct inet_peer __rcu ***stackend)
{
- struct inet_peer **nodep, *node, *l, *r;
+ struct inet_peer __rcu **nodep;
+ struct inet_peer *node, *l, *r;
int lh, rh;
while (stackend > stack) {
nodep = *--stackend;
- node = *nodep;
- l = node->avl_left;
- r = node->avl_right;
+ node = rcu_dereference_protected(*nodep,
+ lockdep_is_held(&peers.lock));
+ l = rcu_dereference_protected(node->avl_left,
+ lockdep_is_held(&peers.lock));
+ r = rcu_dereference_protected(node->avl_right,
+ lockdep_is_held(&peers.lock));
lh = node_height(l);
rh = node_height(r);
if (lh > rh + 1) { /* l: RH+2 */
struct inet_peer *ll, *lr, *lrl, *lrr;
int lrh;
- ll = l->avl_left;
- lr = l->avl_right;
+ ll = rcu_dereference_protected(l->avl_left,
+ lockdep_is_held(&peers.lock));
+ lr = rcu_dereference_protected(l->avl_right,
+ lockdep_is_held(&peers.lock));
lrh = node_height(lr);
if (lrh <= node_height(ll)) { /* ll: RH+1 */
- node->avl_left = lr; /* lr: RH or RH+1 */
- node->avl_right = r; /* r: RH */
+ RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
node->avl_height = lrh + 1; /* RH+1 or RH+2 */
- l->avl_left = ll; /* ll: RH+1 */
- l->avl_right = node; /* node: RH+1 or RH+2 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
+ RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
l->avl_height = node->avl_height + 1;
- *nodep = l;
+ RCU_INIT_POINTER(*nodep, l);
} else { /* ll: RH, lr: RH+1 */
- lrl = lr->avl_left; /* lrl: RH or RH-1 */
- lrr = lr->avl_right; /* lrr: RH or RH-1 */
- node->avl_left = lrr; /* lrr: RH or RH-1 */
- node->avl_right = r; /* r: RH */
+ lrl = rcu_dereference_protected(lr->avl_left,
+ lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */
+ lrr = rcu_dereference_protected(lr->avl_right,
+ lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
+ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
node->avl_height = rh + 1; /* node: RH+1 */
- l->avl_left = ll; /* ll: RH */
- l->avl_right = lrl; /* lrl: RH or RH-1 */
+ RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
+ RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
l->avl_height = rh + 1; /* l: RH+1 */
- lr->avl_left = l; /* l: RH+1 */
- lr->avl_right = node; /* node: RH+1 */
+ RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
+ RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
lr->avl_height = rh + 2;
- *nodep = lr;
+ RCU_INIT_POINTER(*nodep, lr);
}
} else if (rh > lh + 1) { /* r: LH+2 */
struct inet_peer *rr, *rl, *rlr, *rll;
int rlh;
- rr = r->avl_right;
- rl = r->avl_left;
+ rr = rcu_dereference_protected(r->avl_right,
+ lockdep_is_held(&peers.lock));
+ rl = rcu_dereference_protected(r->avl_left,
+ lockdep_is_held(&peers.lock));
rlh = node_height(rl);
if (rlh <= node_height(rr)) { /* rr: LH+1 */
- node->avl_right = rl; /* rl: LH or LH+1 */
- node->avl_left = l; /* l: LH */
+ RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
node->avl_height = rlh + 1; /* LH+1 or LH+2 */
- r->avl_right = rr; /* rr: LH+1 */
- r->avl_left = node; /* node: LH+1 or LH+2 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
+ RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
r->avl_height = node->avl_height + 1;
- *nodep = r;
+ RCU_INIT_POINTER(*nodep, r);
} else { /* rr: RH, rl: RH+1 */
- rlr = rl->avl_right; /* rlr: LH or LH-1 */
- rll = rl->avl_left; /* rll: LH or LH-1 */
- node->avl_right = rll; /* rll: LH or LH-1 */
- node->avl_left = l; /* l: LH */
+ rlr = rcu_dereference_protected(rl->avl_right,
+ lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */
+ rll = rcu_dereference_protected(rl->avl_left,
+ lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
+ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
node->avl_height = lh + 1; /* node: LH+1 */
- r->avl_right = rr; /* rr: LH */
- r->avl_left = rlr; /* rlr: LH or LH-1 */
+ RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
+ RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
r->avl_height = lh + 1; /* r: LH+1 */
- rl->avl_right = r; /* r: LH+1 */
- rl->avl_left = node; /* node: LH+1 */
+ RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
+ RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
rl->avl_height = lh + 2;
- *nodep = rl;
+ RCU_INIT_POINTER(*nodep, rl);
}
} else {
node->avl_height = (lh > rh ? lh : rh) + 1;
@@ -254,15 +320,21 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
}
}
-/* Called with local BH disabled and the pool write lock held. */
+/* Called with local BH disabled and the pool lock held. */
#define link_to_pool(n) \
do { \
n->avl_height = 1; \
- n->avl_left = peer_avl_empty; \
- n->avl_right = peer_avl_empty; \
- **--stackptr = n; \
+ n->avl_left = peer_avl_empty_rcu; \
+ n->avl_right = peer_avl_empty_rcu; \
+ /* lockless readers can catch us now */ \
+ rcu_assign_pointer(**--stackptr, n); \
peer_avl_rebalance(stack, stackptr); \
-} while(0)
+} while (0)
+
+static void inetpeer_free_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
/* May be called with local BH enabled. */
static void unlink_from_pool(struct inet_peer *p)
@@ -271,31 +343,33 @@ static void unlink_from_pool(struct inet_peer *p)
do_free = 0;
- write_lock_bh(&peer_pool_lock);
+ spin_lock_bh(&peers.lock);
/* Check the reference counter. It was artificially incremented by 1
- * in cleanup() function to prevent sudden disappearing. If the
- * reference count is still 1 then the node is referenced only as `p'
- * here and from the pool. So under the exclusive pool lock it's safe
- * to remove the node and free it later. */
- if (atomic_read(&p->refcnt) == 1) {
- struct inet_peer **stack[PEER_MAXDEPTH];
- struct inet_peer ***stackptr, ***delp;
+ * in cleanup() function to prevent sudden disappearing. If we can
+ * atomically (because of lockless readers) take this last reference,
+ * it's safe to remove the node and free it later.
+ * We use refcnt=-1 to alert lockless readers this entry is deleted.
+ */
+ if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH];
+ struct inet_peer __rcu ***stackptr, ***delp;
if (lookup(p->v4daddr, stack) != p)
BUG();
delp = stackptr - 1; /* *delp[0] == p */
- if (p->avl_left == peer_avl_empty) {
+ if (p->avl_left == peer_avl_empty_rcu) {
*delp[0] = p->avl_right;
--stackptr;
} else {
/* look for a node to insert instead of p */
struct inet_peer *t;
t = lookup_rightempty(p);
- BUG_ON(*stackptr[-1] != t);
+ BUG_ON(rcu_dereference_protected(*stackptr[-1],
+ lockdep_is_held(&peers.lock)) != t);
**--stackptr = t->avl_left;
/* t is removed, t->v4daddr > x->v4daddr for any
* x in p->avl_left subtree.
* Put t in the old place of p. */
- *delp[0] = t;
+ RCU_INIT_POINTER(*delp[0], t);
t->avl_left = p->avl_left;
t->avl_right = p->avl_right;
t->avl_height = p->avl_height;
@@ -303,20 +377,21 @@ static void unlink_from_pool(struct inet_peer *p)
delp[1] = &t->avl_left; /* was &p->avl_left */
}
peer_avl_rebalance(stack, stackptr);
- peer_total--;
+ peers.total--;
do_free = 1;
}
- write_unlock_bh(&peer_pool_lock);
+ spin_unlock_bh(&peers.lock);
if (do_free)
- kmem_cache_free(peer_cachep, p);
+ call_rcu_bh(&p->rcu, inetpeer_free_rcu);
else
/* The node is used again. Decrease the reference counter
* back. The loop "cleanup -> unlink_from_unused
* -> unlink_from_pool -> putpeer -> link_to_unused
* -> cleanup (for the same node)"
* doesn't really exist because the entry will have a
- * recent deletion time and will not be cleaned again soon. */
+ * recent deletion time and will not be cleaned again soon.
+ */
inet_putpeer(p);
}
@@ -326,16 +401,16 @@ static int cleanup_once(unsigned long ttl)
struct inet_peer *p = NULL;
/* Remove the first entry from the list of unused nodes. */
- spin_lock_bh(&inet_peer_unused_lock);
- if (!list_empty(&unused_peers)) {
+ spin_lock_bh(&unused_peers.lock);
+ if (!list_empty(&unused_peers.list)) {
__u32 delta;
- p = list_first_entry(&unused_peers, struct inet_peer, unused);
+ p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
delta = (__u32)jiffies - p->dtime;
if (delta < ttl) {
/* Do not prune fresh entries. */
- spin_unlock_bh(&inet_peer_unused_lock);
+ spin_unlock_bh(&unused_peers.lock);
return -1;
}
@@ -345,7 +420,7 @@ static int cleanup_once(unsigned long ttl)
* before unlink_from_pool() call. */
atomic_inc(&p->refcnt);
}
- spin_unlock_bh(&inet_peer_unused_lock);
+ spin_unlock_bh(&unused_peers.lock);
if (p == NULL)
/* It means that the total number of USED entries has
@@ -360,62 +435,56 @@ static int cleanup_once(unsigned long ttl)
/* Called with or without local BH being disabled. */
struct inet_peer *inet_getpeer(__be32 daddr, int create)
{
- struct inet_peer *p, *n;
- struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
+ struct inet_peer *p;
+ struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
- /* Look up for the address quickly. */
- read_lock_bh(&peer_pool_lock);
- p = lookup(daddr, NULL);
- if (p != peer_avl_empty)
- atomic_inc(&p->refcnt);
- read_unlock_bh(&peer_pool_lock);
+ /* Look up for the address quickly, lockless.
+ * Because of a concurrent writer, we might not find an existing entry.
+ */
+ rcu_read_lock_bh();
+ p = lookup_rcu_bh(daddr);
+ rcu_read_unlock_bh();
+
+ if (p) {
+ /* The existing node has been found.
+ * Remove the entry from unused list if it was there.
+ */
+ unlink_from_unused(p);
+ return p;
+ }
+ /* retry an exact lookup, taking the lock before.
+ * At least, nodes should be hot in our cache.
+ */
+ spin_lock_bh(&peers.lock);
+ p = lookup(daddr, stack);
if (p != peer_avl_empty) {
- /* The existing node has been found. */
+ atomic_inc(&p->refcnt);
+ spin_unlock_bh(&peers.lock);
/* Remove the entry from unused list if it was there. */
unlink_from_unused(p);
return p;
}
+ p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
+ if (p) {
+ p->v4daddr = daddr;
+ atomic_set(&p->refcnt, 1);
+ atomic_set(&p->rid, 0);
+ atomic_set(&p->ip_id_count, secure_ip_id(daddr));
+ p->tcp_ts_stamp = 0;
+ INIT_LIST_HEAD(&p->unused);
+
+
+ /* Link the node. */
+ link_to_pool(p);
+ peers.total++;
+ }
+ spin_unlock_bh(&peers.lock);
- if (!create)
- return NULL;
-
- /* Allocate the space outside the locked region. */
- n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
- if (n == NULL)
- return NULL;
- n->v4daddr = daddr;
- atomic_set(&n->refcnt, 1);
- atomic_set(&n->rid, 0);
- atomic_set(&n->ip_id_count, secure_ip_id(daddr));
- n->tcp_ts_stamp = 0;
-
- write_lock_bh(&peer_pool_lock);
- /* Check if an entry has suddenly appeared. */
- p = lookup(daddr, stack);
- if (p != peer_avl_empty)
- goto out_free;
-
- /* Link the node. */
- link_to_pool(n);
- INIT_LIST_HEAD(&n->unused);
- peer_total++;
- write_unlock_bh(&peer_pool_lock);
-
- if (peer_total >= inet_peer_threshold)
+ if (peers.total >= inet_peer_threshold)
/* Remove one less-recently-used entry. */
cleanup_once(0);
- return n;
-
-out_free:
- /* The appropriate node is already in the pool. */
- atomic_inc(&p->refcnt);
- write_unlock_bh(&peer_pool_lock);
- /* Remove the entry from unused list if it was there. */
- unlink_from_unused(p);
- /* Free preallocated the preallocated node. */
- kmem_cache_free(peer_cachep, n);
return p;
}
@@ -425,12 +494,12 @@ static void peer_check_expire(unsigned long dummy)
unsigned long now = jiffies;
int ttl;
- if (peer_total >= inet_peer_threshold)
+ if (peers.total >= inet_peer_threshold)
ttl = inet_peer_minttl;
else
ttl = inet_peer_maxttl
- (inet_peer_maxttl - inet_peer_minttl) / HZ *
- peer_total / inet_peer_threshold * HZ;
+ peers.total / inet_peer_threshold * HZ;
while (!cleanup_once(ttl)) {
if (jiffies != now)
break;
@@ -439,22 +508,25 @@ static void peer_check_expire(unsigned long dummy)
/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
* interval depending on the total number of entries (more entries,
* less interval). */
- if (peer_total >= inet_peer_threshold)
+ if (peers.total >= inet_peer_threshold)
peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
else
peer_periodic_timer.expires = jiffies
+ inet_peer_gc_maxtime
- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- peer_total / inet_peer_threshold * HZ;
+ peers.total / inet_peer_threshold * HZ;
add_timer(&peer_periodic_timer);
}
void inet_putpeer(struct inet_peer *p)
{
- spin_lock_bh(&inet_peer_unused_lock);
- if (atomic_dec_and_test(&p->refcnt)) {
- list_add_tail(&p->unused, &unused_peers);
+ local_bh_disable();
+
+ if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
+ list_add_tail(&p->unused, &unused_peers.list);
p->dtime = (__u32)jiffies;
+ spin_unlock(&unused_peers.lock);
}
- spin_unlock_bh(&inet_peer_unused_lock);
+
+ local_bh_enable();
}
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 56cdf68a074c..99461f09320f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -87,16 +87,16 @@ int ip_forward(struct sk_buff *skb)
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto sr_failed;
- if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
+ if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
- IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dst_mtu(&rt->u.dst)));
+ htonl(dst_mtu(&rt->dst)));
goto drop;
}
/* We are about to mangle packet. Copy it! */
- if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
iph = ip_hdr(skb);
@@ -113,7 +113,7 @@ int ip_forward(struct sk_buff *skb)
skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
- rt->u.dst.dev, ip_forward_finish);
+ rt->dst.dev, ip_forward_finish);
sr_failed:
/*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 75347ea70ea0..168440834ade 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,19 +116,16 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
struct ip4_create_arg *arg = a;
qp = container_of(q, struct ipq, q);
- return (qp->id == arg->iph->id &&
+ return qp->id == arg->iph->id &&
qp->saddr == arg->iph->saddr &&
qp->daddr == arg->iph->daddr &&
qp->protocol == arg->iph->protocol &&
- qp->user == arg->user);
+ qp->user == arg->user;
}
/* Memory Tracking Functions. */
-static __inline__ void frag_kfree_skb(struct netns_frags *nf,
- struct sk_buff *skb, int *work)
+static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
{
- if (work)
- *work -= skb->truesize;
atomic_sub(skb->truesize, &nf->mem);
kfree_skb(skb);
}
@@ -309,7 +306,7 @@ static int ip_frag_reinit(struct ipq *qp)
fp = qp->q.fragments;
do {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(qp->q.net, fp, NULL);
+ frag_kfree_skb(qp->q.net, fp);
fp = xp;
} while (fp);
@@ -317,6 +314,7 @@ static int ip_frag_reinit(struct ipq *qp)
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
qp->iif = 0;
return 0;
@@ -389,6 +387,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
* in the chain of fragments so far. We must know where to put
* this fragment, right?
*/
+ prev = qp->q.fragments_tail;
+ if (!prev || FRAG_CB(prev)->offset < offset) {
+ next = NULL;
+ goto found;
+ }
prev = NULL;
for (next = qp->q.fragments; next != NULL; next = next->next) {
if (FRAG_CB(next)->offset >= offset)
@@ -396,6 +399,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
prev = next;
}
+found:
/* We found where to put this one. Check for overlap with
* preceding fragment, and, if needed, align things so that
* any overlaps are eliminated.
@@ -446,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- frag_kfree_skb(qp->q.net, free_it, NULL);
+ frag_kfree_skb(qp->q.net, free_it);
}
}
@@ -454,6 +458,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
/* Insert this fragment in the chain of fragments. */
skb->next = next;
+ if (!next)
+ qp->q.fragments_tail = skb;
if (prev)
prev->next = skb;
else
@@ -507,6 +513,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_nomem;
fp->next = head->next;
+ if (!fp->next)
+ qp->q.fragments_tail = fp;
prev->next = fp;
skb_morph(head, qp->q.fragments);
@@ -534,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments. */
- if (skb_has_frags(head)) {
+ if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = 0;
@@ -556,7 +564,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- atomic_sub(head->truesize, &qp->q.net->mem);
for (fp=head->next; fp; fp = fp->next) {
head->data_len += fp->len;
@@ -566,8 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
- atomic_sub(fp->truesize, &qp->q.net->mem);
}
+ atomic_sub(head->truesize, &qp->q.net->mem);
head->next = NULL;
head->dev = dev;
@@ -578,6 +585,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
iph->tot_len = htons(len);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
return 0;
out_nomem:
@@ -624,6 +632,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
kfree_skb(skb);
return -ENOMEM;
}
+EXPORT_SYMBOL(ip_defrag);
#ifdef CONFIG_SYSCTL
static int zero;
@@ -777,5 +786,3 @@ void __init ipfrag_init(void)
ip4_frags.secret_interval = 10 * 60 * HZ;
inet_frags_init(&ip4_frags);
}
-
-EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 32618e11076d..70ff77f02eee 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,8 +44,9 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
+#include <net/gre.h>
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -63,13 +64,13 @@
We cannot track such dead loops during route installation,
it is infeasible task. The most general solutions would be
to keep skb->encapsulation counter (sort of local ttl),
- and silently drop packet when it expires. It is the best
+ and silently drop packet when it expires. It is a good
solution, but it supposes maintaing new variable in ALL
skb, even if no tunneling is used.
- Current solution: HARD_TX_LOCK lock breaks dead loops.
-
-
+ Current solution: xmit_recursion breaks dead loops. This is a percpu
+ counter, since when we enter the first ndo_xmit(), cpu migration is
+ forbidden. We force an exit if this counter reaches RECURSION_LIMIT
2. Networking dead loops would not kill routers, but would really
kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
static int ipgre_net_id __read_mostly;
struct ipgre_net {
- struct ip_tunnel *tunnels[4][HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
struct net_device *fb_tunnel_dev;
};
@@ -158,13 +159,40 @@ struct ipgre_net {
#define tunnels_l tunnels[1]
#define tunnels_wc tunnels[0]
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ipgre_lock);
#define for_each_ip_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
/* Given src, dst and key, find appropriate for input tunnel. */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
{
struct net *net = dev_net(dev);
int link = dev->ifindex;
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(key);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(key);
struct ip_tunnel *t, *cand = NULL;
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
return NULL;
}
-static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
+static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
struct ip_tunnel_parm *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
- unsigned h = HASH(key);
+ unsigned int h = HASH(key);
int prio = 0;
if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
return &ign->tunnels[prio][h];
}
-static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
+static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
struct ip_tunnel *t)
{
return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipgre_bucket(ign, t);
+ struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
- spin_lock_bh(&ipgre_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ipgre_lock);
}
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
-
- for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ipgre_lock);
- *tp = t->next;
- spin_unlock_bh(&ipgre_lock);
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipgre_bucket(ign, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
__be32 local = parms->iph.saddr;
__be32 key = parms->i_key;
int link = parms->link;
- struct ip_tunnel *t, **tp;
+ struct ip_tunnel *t;
+ struct ip_tunnel __rcu **tp;
struct ipgre_net *ign = net_generic(net, ipgre_net_id);
- for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
+ for (tp = __ipgre_bucket(ign, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next)
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
return t;
}
-static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
struct ip_tunnel_parm *parms, int create)
{
struct ip_tunnel *t, *nt;
@@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
if ((tunnel = ipgre_tunnel_lookup(skb->dev,
iph->saddr, iph->daddr, key,
gre_proto))) {
- struct net_device_stats *stats = &tunnel->dev->stats;
+ struct pcpu_tstats *tstats;
secpath_reset(skb);
@@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
/* Looped back packet, drop it! */
if (skb_rtable(skb)->fl.iif == 0)
goto drop;
- stats->multicast++;
+ tunnel->dev->stats.multicast++;
skb->pkt_type = PACKET_BROADCAST;
}
#endif
if (((flags&GRE_CSUM) && csum) ||
(!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
- stats->rx_crc_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_crc_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
if (tunnel->parms.i_flags&GRE_SEQ) {
if (!(flags&GRE_SEQ) ||
(tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
- stats->rx_fifo_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_fifo_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
tunnel->i_seqno = seqno + 1;
@@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
if (!pskb_may_pull(skb, ETH_HLEN)) {
- stats->rx_length_errors++;
- stats->rx_errors++;
+ tunnel->dev->stats.rx_length_errors++;
+ tunnel->dev->stats.rx_errors++;
goto drop;
}
@@ -640,14 +670,19 @@ static int ipgre_rcv(struct sk_buff *skb)
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
- skb_tunnel_rx(skb, tunnel->dev);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, tunnel->dev);
skb_reset_network_header(skb);
ipgre_ecn_decapsulate(iph, skb);
netif_rx(skb);
+
rcu_read_unlock();
- return(0);
+ return 0;
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -655,20 +690,19 @@ drop:
rcu_read_unlock();
drop_nolock:
kfree_skb(skb);
- return(0);
+ return 0;
}
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct pcpu_tstats *tstats;
struct iphdr *old_iph = ip_hdr(skb);
struct iphdr *tiph;
u8 tos;
__be16 df;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int gre_hlen;
@@ -690,7 +724,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
/* NBMA tunnel */
if (skb_dst(skb) == NULL) {
- stats->tx_fifo_errors++;
+ dev->stats.tx_fifo_errors++;
goto tx_error;
}
@@ -699,7 +733,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if ((dst = rt->rt_gateway) == 0)
goto tx_error_icmp;
}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
struct in6_addr *addr6;
int addr_type;
@@ -731,31 +765,39 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tos = 0;
if (skb->protocol == htons(ETH_P_IP))
tos = old_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
}
{
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ }
+;
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error;
}
}
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
- stats->collisions++;
+ dev->stats.collisions++;
goto tx_error;
}
df = tiph->frag_off;
if (df)
- mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
else
mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
@@ -772,7 +814,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
goto tx_error;
}
}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6)) {
struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
@@ -803,7 +845,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tunnel->err_count = 0;
}
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
+ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
(skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
@@ -812,7 +854,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
dev->needed_headroom = max_headroom;
if (!new_skb) {
ip_rt_put(rt);
- txq->tx_dropped++;
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -830,7 +872,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/*
* Push down and install the IPIP header.
@@ -848,12 +890,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if ((iph->ttl = tiph->ttl) == 0) {
if (skb->protocol == htons(ETH_P_IP))
iph->ttl = old_iph->ttl;
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (skb->protocol == htons(ETH_P_IPV6))
iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
else
- iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+ iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
}
((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -879,15 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
nf_reset(skb);
-
- IPTUNNEL_XMIT();
+ tstats = this_cpu_ptr(dev->tstats);
+ __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- stats->tx_errors++;
+ dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -907,15 +949,21 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
/* Guess output device to choose reasonable mtu and needed_headroom */
if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ };
struct rtable *rt;
+
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
ip_rt_put(rt);
}
@@ -1010,7 +1058,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
} else {
- unsigned nflags = 0;
+ unsigned int nflags = 0;
t = netdev_priv(dev);
@@ -1024,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
break;
}
ipgre_tunnel_unlink(ign, t);
+ synchronize_net();
t->parms.iph.saddr = p.iph.saddr;
t->parms.iph.daddr = p.iph.daddr;
t->parms.i_key = p.i_key;
@@ -1123,7 +1172,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
- const void *daddr, const void *saddr, unsigned len)
+ const void *daddr, const void *saddr, unsigned int len)
{
struct ip_tunnel *t = netdev_priv(dev);
struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1165,16 +1214,22 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi fl = { .oif = t->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = t->parms.iph.daddr,
- .saddr = t->parms.iph.saddr,
- .tos = RT_TOS(t->parms.iph.tos) } },
- .proto = IPPROTO_GRE };
+ struct flowi fl = {
+ .oif = t->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = t->parms.iph.daddr,
+ .saddr = t->parms.iph.saddr,
+ .tos = RT_TOS(t->parms.iph.tos)
+ }
+ },
+ .proto = IPPROTO_GRE
+ };
struct rtable *rt;
+
if (ip_route_output_key(dev_net(dev), &rt, &fl))
return -EADDRNOTAVAIL;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
if (__in_dev_get_rtnl(dev) == NULL)
return -EADDRNOTAVAIL;
@@ -1191,10 +1246,8 @@ static int ipgre_close(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
struct in_device *in_dev;
in_dev = inetdev_by_index(dev_net(dev), t->mlink);
- if (in_dev) {
+ if (in_dev)
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
- in_dev_put(in_dev);
- }
}
return 0;
}
@@ -1211,12 +1264,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
+ .ndo_get_stats = ipgre_get_stats,
};
+static void ipgre_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ipgre_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipgre_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipgre_dev_free;
dev->type = ARPHRD_IPGRE;
dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1254,6 +1314,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
} else
dev->header_ops = &ipgre_header_ops;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
return 0;
}
@@ -1261,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
- struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
tunnel->dev = dev;
strcpy(tunnel->parms.name, dev->name);
@@ -1272,14 +1335,12 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
tunnel->hlen = sizeof(struct iphdr) + 4;
dev_hold(dev);
- ign->tunnels_wc[0] = tunnel;
}
-static const struct net_protocol ipgre_protocol = {
- .handler = ipgre_rcv,
- .err_handler = ipgre_err,
- .netns_ok = 1,
+static const struct gre_protocol ipgre_protocol = {
+ .handler = ipgre_rcv,
+ .err_handler = ipgre_err,
};
static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1289,11 +1350,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
for (prio = 0; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = ign->tunnels[prio][h];
+ struct ip_tunnel *t;
+
+ t = rtnl_dereference(ign->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
@@ -1318,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net)
if ((err = register_netdev(ign->fb_tunnel_dev)))
goto err_reg_dev;
+ rcu_assign_pointer(ign->tunnels_wc[0],
+ netdev_priv(ign->fb_tunnel_dev));
return 0;
err_reg_dev:
- free_netdev(ign->fb_tunnel_dev);
+ ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
return err;
}
@@ -1439,6 +1504,10 @@ static int ipgre_tap_init(struct net_device *dev)
ipgre_tunnel_bind_dev(dev);
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
return 0;
}
@@ -1449,6 +1518,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
+ .ndo_get_stats = ipgre_get_stats,
};
static void ipgre_tap_setup(struct net_device *dev)
@@ -1457,7 +1527,7 @@ static void ipgre_tap_setup(struct net_device *dev)
ether_setup(dev);
dev->netdev_ops = &ipgre_tap_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipgre_dev_free;
dev->iflink = 0;
dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1485,6 +1555,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
if (!tb[IFLA_MTU])
dev->mtu = mtu;
+ /* Can use a lockless transmit, unless we generate output sequences */
+ if (!(nt->parms.o_flags & GRE_SEQ))
+ dev->features |= NETIF_F_LLTX;
+
err = register_netdevice(dev);
if (err)
goto out;
@@ -1520,7 +1594,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
t = nt;
if (dev->type != ARPHRD_ETHER) {
- unsigned nflags = 0;
+ unsigned int nflags = 0;
if (ipv4_is_multicast(p.iph.daddr))
nflags = IFF_BROADCAST;
@@ -1661,7 +1735,7 @@ static int __init ipgre_init(void)
if (err < 0)
return err;
- err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
+ err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
if (err < 0) {
printk(KERN_INFO "ipgre init: can't add protocol\n");
goto add_proto_failed;
@@ -1681,7 +1755,7 @@ out:
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+ gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
unregister_pernet_device(&ipgre_net_ops);
goto out;
@@ -1691,7 +1765,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+ if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
printk(KERN_INFO "ipgre close: can't remove protocol\n");
unregister_pernet_device(&ipgre_net_ops);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d930dc5e4d85..d859bcc26cb7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -146,7 +146,7 @@
#include <linux/netlink.h>
/*
- * Process Router Attention IP option
+ * Process Router Attention IP option (RFC 2113)
*/
int ip_call_ra_chain(struct sk_buff *skb)
{
@@ -155,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
struct sock *last = NULL;
struct net_device *dev = skb->dev;
- read_lock(&ip_ra_lock);
- for (ra = ip_ra_chain; ra; ra = ra->next) {
+ for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
/* If socket is bound to an interface, only report
@@ -167,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
sk->sk_bound_dev_if == dev->ifindex) &&
net_eq(sock_net(sk), dev_net(dev))) {
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
- read_unlock(&ip_ra_lock);
+ if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
return 1;
- }
}
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -183,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
if (last) {
raw_rcv(last, skb);
- read_unlock(&ip_ra_lock);
return 1;
}
- read_unlock(&ip_ra_lock);
return 0;
}
@@ -298,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb)
}
if (unlikely(opt->srr)) {
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
if (in_dev) {
if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
if (IN_DEV_LOG_MARTIANS(in_dev) &&
net_ratelimit())
printk(KERN_INFO "source route option %pI4 -> %pI4\n",
&iph->saddr, &iph->daddr);
- in_dev_put(in_dev);
goto drop;
}
-
- in_dev_put(in_dev);
}
if (ip_options_rcv_srr(skb))
@@ -340,13 +333,16 @@ static int ip_rcv_finish(struct sk_buff *skb)
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
+ else if (err == -EXDEV)
+ NET_INC_STATS_BH(dev_net(skb->dev),
+ LINUX_MIB_IPRPFILTER);
goto drop;
}
}
#ifdef CONFIG_NET_CLS_ROUTE
if (unlikely(skb_dst(skb)->tclassid)) {
- struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
+ struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
@@ -360,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
return dst_input(skb);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488ed..1906fa35860c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
}
return -EINVAL;
}
-
+EXPORT_SYMBOL(ip_options_compile);
/*
* Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
return 0;
}
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 041d41df1224..439d2a34ee44 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -89,6 +89,7 @@ __inline__ void ip_send_check(struct iphdr *iph)
iph->check = 0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
+EXPORT_SYMBOL(ip_send_check);
int __ip_local_out(struct sk_buff *skb)
{
@@ -151,15 +152,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->version = 4;
iph->ihl = 5;
iph->tos = inet->tos;
- if (ip_dont_fragment(sk, &rt->u.dst))
+ if (ip_dont_fragment(sk, &rt->dst))
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
- iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->protocol = sk->sk_protocol;
- ip_select_ident(iph, &rt->u.dst, sk);
+ ip_select_ident(iph, &rt->dst, sk);
if (opt && opt->optlen) {
iph->ihl += opt->optlen>>2;
@@ -172,7 +173,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
/* Send it out. */
return ip_local_out(skb);
}
-
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
static inline int ip_finish_output2(struct sk_buff *skb)
@@ -240,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = rt->u.dst.dev;
+ struct net_device *dev = rt->dst.dev;
/*
* If the indicated interface is up and running, send the packet.
@@ -359,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb)
if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
goto no_route;
}
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
}
- skb_dst_set_noref(skb, &rt->u.dst);
+ skb_dst_set_noref(skb, &rt->dst);
packet_routed:
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -372,11 +372,11 @@ packet_routed:
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df)
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
- iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
@@ -387,7 +387,7 @@ packet_routed:
ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_more(iph, &rt->u.dst, sk,
+ ip_select_ident_more(iph, &rt->dst, sk,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
skb->priority = sk->sk_priority;
@@ -403,6 +403,7 @@ no_route:
kfree_skb(skb);
return -EHOSTUNREACH;
}
+EXPORT_SYMBOL(ip_queue_xmit);
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
@@ -411,7 +412,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->priority = from->priority;
to->protocol = from->protocol;
skb_dst_drop(to);
- skb_dst_set(to, dst_clone(skb_dst(from)));
+ skb_dst_copy(to, from);
to->dev = from->dev;
to->mark = from->mark;
@@ -442,17 +443,16 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
struct iphdr *iph;
- int raw = 0;
int ptr;
struct net_device *dev;
struct sk_buff *skb2;
- unsigned int mtu, hlen, left, len, ll_rs, pad;
+ unsigned int mtu, hlen, left, len, ll_rs;
int offset;
__be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
int err = 0;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
/*
* Point into the IP datagram header.
@@ -473,7 +473,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
*/
hlen = iph->ihl * 4;
- mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
+ mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
if (skb->nf_bridge)
mtu -= nf_bridge_mtu_reduction(skb);
@@ -487,10 +487,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
* LATER: this step can be merged to real generation of fragments,
* we can switch to copy when see the first bad fragment.
*/
- if (skb_has_frags(skb)) {
- struct sk_buff *frag;
+ if (skb_has_frag_list(skb)) {
+ struct sk_buff *frag, *frag2;
int first_len = skb_pagelen(skb);
- int truesizes = 0;
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
@@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
- goto slow_path;
+ goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
- goto slow_path;
+ goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
}
- truesizes += frag->truesize;
+ skb->truesize -= frag->truesize;
}
/* Everything is OK. Generate! */
@@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
frag = skb_shinfo(skb)->frag_list;
skb_frag_list_init(skb);
skb->data_len = first_len - skb_headlen(skb);
- skb->truesize -= truesizes;
skb->len = first_len;
iph->tot_len = htons(first_len);
iph->frag_off = htons(IP_MF);
@@ -576,18 +574,25 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
}
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
+
+slow_path_clean:
+ skb_walk_frags(skb, frag2) {
+ if (frag2 == frag)
+ break;
+ frag2->sk = NULL;
+ frag2->destructor = NULL;
+ skb->truesize += frag2->truesize;
+ }
}
slow_path:
left = skb->len - hlen; /* Space per frame */
- ptr = raw + hlen; /* Where to start from */
+ ptr = hlen; /* Where to start from */
/* for bridged IP traffic encapsulated inside f.e. a vlan header,
* we need to make room for the encapsulating header
*/
- pad = nf_bridge_pad(skb);
- ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
- mtu -= pad;
+ ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
/*
* Fragment the datagram.
@@ -697,7 +702,6 @@ fail:
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
}
-
EXPORT_SYMBOL(ip_fragment);
int
@@ -716,6 +720,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
}
return 0;
}
+EXPORT_SYMBOL(ip_generic_getfrag);
static inline __wsum
csum_page(struct page *page, int offset, int copy)
@@ -833,16 +838,15 @@ int ip_append_data(struct sock *sk,
*/
*rtp = NULL;
inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
- rt->u.dst.dev->mtu :
- dst_mtu(rt->u.dst.path);
- inet->cork.dst = &rt->u.dst;
+ rt->dst.dev->mtu :
+ dst_mtu(rt->dst.path);
+ inet->cork.dst = &rt->dst;
inet->cork.length = 0;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
- if ((exthdrlen = rt->u.dst.header_len) != 0) {
- length += exthdrlen;
- transhdrlen += exthdrlen;
- }
+ exthdrlen = rt->dst.header_len;
+ length += exthdrlen;
+ transhdrlen += exthdrlen;
} else {
rt = (struct rtable *)inet->cork.dst;
if (inet->cork.flags & IPCORK_OPT)
@@ -852,7 +856,7 @@ int ip_append_data(struct sock *sk,
exthdrlen = 0;
mtu = inet->cork.fragsize;
}
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -869,7 +873,7 @@ int ip_append_data(struct sock *sk,
*/
if (transhdrlen &&
length + fragheaderlen <= mtu &&
- rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
+ rt->dst.dev->features & NETIF_F_V4_CSUM &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -878,7 +882,7 @@ int ip_append_data(struct sock *sk,
inet->cork.length += length;
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->u.dst.dev->features & NETIF_F_UFO)) {
+ (rt->dst.dev->features & NETIF_F_UFO)) {
err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
fragheaderlen, transhdrlen, mtu,
flags);
@@ -926,19 +930,22 @@ alloc_new_skb:
fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) &&
- !(rt->u.dst.dev->features&NETIF_F_SG))
+ !(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
- alloclen = datalen + fragheaderlen;
+ alloclen = fraglen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
* because we have no idea what fragment will be
* the last.
*/
- if (datalen == length + fraggap)
- alloclen += rt->u.dst.trailer_len;
-
+ if (datalen == length + fraggap) {
+ alloclen += rt->dst.trailer_len;
+ /* make sure mtu is not reached */
+ if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
+ datalen -= ALIGN(rt->dst.trailer_len, 8);
+ }
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len + 15,
@@ -955,7 +962,7 @@ alloc_new_skb:
else
/* only the initial fragment is
time stamped */
- ipc->shtx.flags = 0;
+ ipc->tx_flags = 0;
}
if (skb == NULL)
goto error;
@@ -966,7 +973,7 @@ alloc_new_skb:
skb->ip_summed = csummode;
skb->csum = 0;
skb_reserve(skb, hh_len);
- *skb_tx(skb) = ipc->shtx;
+ skb_shinfo(skb)->tx_flags = ipc->tx_flags;
/*
* Find where to start putting bytes.
@@ -1010,7 +1017,7 @@ alloc_new_skb:
if (copy > length)
copy = length;
- if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+ if (!(rt->dst.dev->features&NETIF_F_SG)) {
unsigned int off;
off = skb->len;
@@ -1105,10 +1112,10 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
if (inet->cork.flags & IPCORK_OPT)
opt = inet->cork.opt;
- if (!(rt->u.dst.dev->features&NETIF_F_SG))
+ if (!(rt->dst.dev->features&NETIF_F_SG))
return -EOPNOTSUPP;
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
mtu = inet->cork.fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
@@ -1125,7 +1132,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
inet->cork.length += size;
if ((size + skb->len > mtu) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->u.dst.dev->features & NETIF_F_UFO)) {
+ (rt->dst.dev->features & NETIF_F_UFO)) {
skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
}
@@ -1277,8 +1284,8 @@ int ip_push_pending_frames(struct sock *sk)
* If local_df is set too, we still allow to fragment this frame
* locally. */
if (inet->pmtudisc >= IP_PMTUDISC_DO ||
- (skb->len <= dst_mtu(&rt->u.dst) &&
- ip_dont_fragment(sk, &rt->u.dst)))
+ (skb->len <= dst_mtu(&rt->dst) &&
+ ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
if (inet->cork.flags & IPCORK_OPT)
@@ -1287,7 +1294,7 @@ int ip_push_pending_frames(struct sock *sk)
if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
else
- ttl = ip_select_ttl(inet, &rt->u.dst);
+ ttl = ip_select_ttl(inet, &rt->dst);
iph = (struct iphdr *)skb->data;
iph->version = 4;
@@ -1298,7 +1305,7 @@ int ip_push_pending_frames(struct sock *sk)
}
iph->tos = inet->tos;
iph->frag_off = df;
- ip_select_ident(iph, &rt->u.dst, sk);
+ ip_select_ident(iph, &rt->dst, sk);
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
@@ -1311,7 +1318,7 @@ int ip_push_pending_frames(struct sock *sk)
* on dst refcount
*/
inet->cork.dst = NULL;
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
if (iph->protocol == IPPROTO_ICMP)
icmp_out_count(net, ((struct icmphdr *)
@@ -1386,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
daddr = ipc.addr = rt->rt_src;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
if (replyopts.opt.optlen) {
ipc.opt = &replyopts.opt;
@@ -1448,7 +1455,3 @@ void __init ip_init(void)
igmp_mc_proc_init();
#endif
}
-
-EXPORT_SYMBOL(ip_generic_getfrag);
-EXPORT_SYMBOL(ip_queue_xmit);
-EXPORT_SYMBOL(ip_send_check);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ce231780a2b1..3948c86e59ca 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -238,48 +238,68 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
but receiver should be enough clever f.e. to forward mtrace requests,
sent to multicast group to reach destination designated router.
*/
-struct ip_ra_chain *ip_ra_chain;
-DEFINE_RWLOCK(ip_ra_lock);
+struct ip_ra_chain __rcu *ip_ra_chain;
+static DEFINE_SPINLOCK(ip_ra_lock);
+
+
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+ struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
+
+ sock_put(ra->saved_sk);
+ kfree(ra);
+}
int ip_ra_control(struct sock *sk, unsigned char on,
void (*destructor)(struct sock *))
{
- struct ip_ra_chain *ra, *new_ra, **rap;
+ struct ip_ra_chain *ra, *new_ra;
+ struct ip_ra_chain __rcu **rap;
if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
- write_lock_bh(&ip_ra_lock);
- for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
+ spin_lock_bh(&ip_ra_lock);
+ for (rap = &ip_ra_chain;
+ (ra = rcu_dereference_protected(*rap,
+ lockdep_is_held(&ip_ra_lock))) != NULL;
+ rap = &ra->next) {
if (ra->sk == sk) {
if (on) {
- write_unlock_bh(&ip_ra_lock);
+ spin_unlock_bh(&ip_ra_lock);
kfree(new_ra);
return -EADDRINUSE;
}
- *rap = ra->next;
- write_unlock_bh(&ip_ra_lock);
+ /* dont let ip_call_ra_chain() use sk again */
+ ra->sk = NULL;
+ rcu_assign_pointer(*rap, ra->next);
+ spin_unlock_bh(&ip_ra_lock);
if (ra->destructor)
ra->destructor(sk);
- sock_put(sk);
- kfree(ra);
+ /*
+ * Delay sock_put(sk) and kfree(ra) after one rcu grace
+ * period. This guarantee ip_call_ra_chain() dont need
+ * to mess with socket refcounts.
+ */
+ ra->saved_sk = sk;
+ call_rcu(&ra->rcu, ip_ra_destroy_rcu);
return 0;
}
}
if (new_ra == NULL) {
- write_unlock_bh(&ip_ra_lock);
+ spin_unlock_bh(&ip_ra_lock);
return -ENOBUFS;
}
new_ra->sk = sk;
new_ra->destructor = destructor;
new_ra->next = ra;
- *rap = new_ra;
+ rcu_assign_pointer(*rap, new_ra);
sock_hold(sk);
- write_unlock_bh(&ip_ra_lock);
+ spin_unlock_bh(&ip_ra_lock);
return 0;
}
@@ -449,7 +469,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
(1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
(1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
(1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
- (1<<IP_MINTTL))) ||
+ (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
optname == IP_MULTICAST_TTL ||
optname == IP_MULTICAST_ALL ||
optname == IP_MULTICAST_LOOP ||
@@ -572,6 +592,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
inet->hdrincl = val ? 1 : 0;
break;
+ case IP_NODEFRAG:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->nodefrag = val ? 1 : 0;
+ break;
case IP_MTU_DISCOVER:
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
goto e_inval;
@@ -1106,6 +1133,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_HDRINCL:
val = inet->hdrincl;
break;
+ case IP_NODEFRAG:
+ val = inet->nodefrag;
+ break;
case IP_MTU_DISCOVER:
val = inet->pmtudisc;
break;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b9d84e800cf4..3a6e1ec5e9ae 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -665,6 +665,13 @@ ic_dhcp_init_options(u8 *options)
memcpy(e, ic_req_params, sizeof(ic_req_params));
e += sizeof(ic_req_params);
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
if (*vendor_class_identifier) {
printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
vendor_class_identifier);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7fd636711037..cd300aaee78f 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
static int ipip_net_id __read_mostly;
struct ipip_net {
- struct ip_tunnel *tunnels_r_l[HASH_SIZE];
- struct ip_tunnel *tunnels_r[HASH_SIZE];
- struct ip_tunnel *tunnels_l[HASH_SIZE];
- struct ip_tunnel *tunnels_wc[1];
- struct ip_tunnel **tunnels[4];
+ struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+ struct ip_tunnel __rcu *tunnels_wc[1];
+ struct ip_tunnel __rcu **tunnels[4];
struct net_device *fb_tunnel_dev;
};
-static void ipip_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
static void ipip_tunnel_setup(struct net_device *dev);
+static void ipip_dev_free(struct net_device *dev);
/*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
*/
-static DEFINE_SPINLOCK(ipip_lock);
#define for_each_ip_tunnel_rcu(start) \
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_bytes;
+};
+
+static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+{
+ struct pcpu_tstats sum = { 0 };
+ int i;
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+ sum.rx_packets += tstats->rx_packets;
+ sum.rx_bytes += tstats->rx_bytes;
+ sum.tx_packets += tstats->tx_packets;
+ sum.tx_bytes += tstats->tx_bytes;
+ }
+ dev->stats.rx_packets = sum.rx_packets;
+ dev->stats.rx_bytes = sum.rx_bytes;
+ dev->stats.tx_packets = sum.tx_packets;
+ dev->stats.tx_bytes = sum.tx_bytes;
+ return &dev->stats;
+}
+
static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
{
- unsigned h0 = HASH(remote);
- unsigned h1 = HASH(local);
+ unsigned int h0 = HASH(remote);
+ unsigned int h1 = HASH(local);
struct ip_tunnel *t;
struct ipip_net *ipn = net_generic(net, ipip_net_id);
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
return NULL;
}
-static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
+static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
struct ip_tunnel_parm *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- unsigned h = 0;
+ unsigned int h = 0;
int prio = 0;
if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
return &ipn->tunnels[prio][h];
}
-static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
+static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
struct ip_tunnel *t)
{
return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp;
-
- for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
- if (t == *tp) {
- spin_lock_bh(&ipip_lock);
- *tp = t->next;
- spin_unlock_bh(&ipip_lock);
+ struct ip_tunnel __rcu **tp;
+ struct ip_tunnel *iter;
+
+ for (tp = ipip_bucket(ipn, t);
+ (iter = rtnl_dereference(*tp)) != NULL;
+ tp = &iter->next) {
+ if (t == iter) {
+ rcu_assign_pointer(*tp, t->next);
break;
}
}
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
{
- struct ip_tunnel **tp = ipip_bucket(ipn, t);
+ struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
- spin_lock_bh(&ipip_lock);
- t->next = *tp;
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
- spin_unlock_bh(&ipip_lock);
}
static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
- struct ip_tunnel *t, **tp, *nt;
+ struct ip_tunnel *t, *nt;
+ struct ip_tunnel __rcu **tp;
struct net_device *dev;
char name[IFNAMSIZ];
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) {
+ for (tp = __ipip_bucket(ipn, parms);
+ (t = rtnl_dereference(*tp)) != NULL;
+ tp = &t->next) {
if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
return t;
}
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
if (parms->name[0])
strlcpy(name, parms->name, IFNAMSIZ);
else
- sprintf(name, "tunl%%d");
+ strcpy(name, "tunl%d");
dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
if (dev == NULL)
@@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
nt = netdev_priv(dev);
nt->parms = *parms;
- ipip_tunnel_init(dev);
+ if (ipip_tunnel_init(dev) < 0)
+ goto failed_free;
if (register_netdevice(dev) < 0)
goto failed_free;
@@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
return nt;
failed_free:
- free_netdev(dev);
+ ipip_dev_free(dev);
return NULL;
}
+/* called with RTNL */
static void ipip_tunnel_uninit(struct net_device *dev)
{
struct net *net = dev_net(dev);
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- if (dev == ipn->fb_tunnel_dev) {
- spin_lock_bh(&ipip_lock);
- ipn->tunnels_wc[0] = NULL;
- spin_unlock_bh(&ipip_lock);
- } else
+ if (dev == ipn->fb_tunnel_dev)
+ rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
+ else
ipip_tunnel_unlink(ipn, netdev_priv(dev));
dev_put(dev);
}
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
rcu_read_lock();
- if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
- iph->saddr, iph->daddr)) != NULL) {
+ tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+ if (tunnel != NULL) {
+ struct pcpu_tstats *tstats;
+
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
rcu_read_unlock();
kfree_skb(skb);
@@ -374,10 +406,16 @@ static int ipip_rcv(struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
skb->pkt_type = PACKET_HOST;
- skb_tunnel_rx(skb, tunnel->dev);
+ tstats = this_cpu_ptr(tunnel->dev->tstats);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+
+ __skb_tunnel_rx(skb, tunnel->dev);
ipip_ecn_decapsulate(iph, skb);
+
netif_rx(skb);
+
rcu_read_unlock();
return 0;
}
@@ -394,13 +432,12 @@ static int ipip_rcv(struct sk_buff *skb)
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &dev->stats;
- struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+ struct pcpu_tstats *tstats;
struct iphdr *tiph = &tunnel->parms.iph;
u8 tos = tunnel->parms.iph.tos;
__be16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
- struct net_device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = ip_hdr(skb);
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
@@ -410,13 +447,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (skb->protocol != htons(ETH_P_IP))
goto tx_error;
- if (tos&1)
+ if (tos & 1)
tos = old_iph->tos;
if (!dst) {
/* NBMA tunnel */
if ((rt = skb_rtable(skb)) == NULL) {
- stats->tx_fifo_errors++;
+ dev->stats.tx_fifo_errors++;
goto tx_error;
}
if ((dst = rt->rt_gateway) == 0)
@@ -424,32 +461,38 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
}
{
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = dst,
- .saddr = tiph->saddr,
- .tos = RT_TOS(tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dst,
+ .saddr = tiph->saddr,
+ .tos = RT_TOS(tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
- stats->tx_carrier_errors++;
+ dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
}
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
- stats->collisions++;
+ dev->stats.collisions++;
goto tx_error;
}
df |= old_iph->frag_off & htons(IP_DF);
if (df) {
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (mtu < 68) {
- stats->collisions++;
+ dev->stats.collisions++;
ip_rt_put(rt);
goto tx_error;
}
@@ -485,7 +528,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- txq->tx_dropped++;
+ dev->stats.tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -503,7 +546,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/*
* Push down and install the IPIP header.
@@ -522,14 +565,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
iph->ttl = old_iph->ttl;
nf_reset(skb);
-
- IPTUNNEL_XMIT();
+ tstats = this_cpu_ptr(dev->tstats);
+ __IPTUNNEL_XMIT(tstats, &dev->stats);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- stats->tx_errors++;
+ dev->stats.tx_errors++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -544,15 +587,21 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
iph = &tunnel->parms.iph;
if (iph->daddr) {
- struct flowi fl = { .oif = tunnel->parms.link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .saddr = iph->saddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = tunnel->parms.link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
struct rtable *rt;
+
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
ip_rt_put(rt);
}
dev->flags |= IFF_POINTOPOINT;
@@ -627,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
}
t = netdev_priv(dev);
ipip_tunnel_unlink(ipn, t);
+ synchronize_net();
t->parms.iph.saddr = p.iph.saddr;
t->parms.iph.daddr = p.iph.daddr;
memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -696,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
.ndo_change_mtu = ipip_tunnel_change_mtu,
-
+ .ndo_get_stats = ipip_get_stats,
};
+static void ipip_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+ free_netdev(dev);
+}
+
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
- dev->destructor = free_netdev;
+ dev->destructor = ipip_dev_free;
dev->type = ARPHRD_TUNNEL;
dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +767,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
}
-static void ipip_tunnel_init(struct net_device *dev)
+static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -725,9 +782,15 @@ static void ipip_tunnel_init(struct net_device *dev)
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
ipip_tunnel_bind_dev(dev);
+
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
}
-static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
+static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +803,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
+ dev->tstats = alloc_percpu(struct pcpu_tstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
dev_hold(dev);
- ipn->tunnels_wc[0] = tunnel;
+ rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+ return 0;
}
-static struct xfrm_tunnel ipip_handler = {
+static struct xfrm_tunnel ipip_handler __read_mostly = {
.handler = ipip_rcv,
.err_handler = ipip_err,
.priority = 1,
@@ -760,11 +828,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
for (prio = 1; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t = ipn->tunnels[prio][h];
+ struct ip_tunnel *t;
+ t = rtnl_dereference(ipn->tunnels[prio][h]);
while (t != NULL) {
unregister_netdevice_queue(t->dev, head);
- t = t->next;
+ t = rtnl_dereference(t->next);
}
}
}
@@ -789,7 +858,9 @@ static int __net_init ipip_init_net(struct net *net)
}
dev_net_set(ipn->fb_tunnel_dev, net);
- ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+ err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+ if (err)
+ goto err_reg_dev;
if ((err = register_netdev(ipn->fb_tunnel_dev)))
goto err_reg_dev;
@@ -797,7 +868,7 @@ static int __net_init ipip_init_net(struct net *net)
return 0;
err_reg_dev:
- free_netdev(ipn->fb_tunnel_dev);
+ ipip_dev_free(ipn->fb_tunnel_dev);
err_alloc_dev:
/* nothing */
return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7f6273506eea..86dd5691af46 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
struct net *net;
#endif
u32 id;
- struct sock *mroute_sk;
+ struct sock __rcu *mroute_sk;
struct timer_list ipmr_expire_timer;
struct list_head mfc_unres_queue;
struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +98,7 @@ struct ipmr_result {
};
/* Big lock, protecting vif table, mrt cache and mroute socket state.
- Note that the changes are semaphored via rtnl_lock.
+ * Note that the changes are semaphored via rtnl_lock.
*/
static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
static DEFINE_SPINLOCK(mfc_unres_lock);
/* We return to original Alan's scheme. Hash table of resolved
- entries is changed only in process context and protected
- with weak lock mrt_lock. Queue of unresolved entries is protected
- with strong spinlock mfc_unres_lock.
-
- In this case data path is free of exclusive locks at all.
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
*/
static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
set_fs(KERNEL_DS);
err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
set_fs(oldfs);
- } else
+ } else {
err = -EOPNOTSUPP;
-
+ }
dev = NULL;
if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
dev->iflink = 0;
rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
rcu_read_unlock();
goto failure;
}
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
mrt->mroute_reg_vif_num = -1;
#endif
- if (vifi+1 == mrt->maxvif) {
+ if (vifi + 1 == mrt->maxvif) {
int tmp;
- for (tmp=vifi-1; tmp>=0; tmp--) {
+
+ for (tmp = vifi - 1; tmp >= 0; tmp--) {
if (VIF_EXISTS(mrt, tmp))
break;
}
@@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
dev_set_allmulti(dev, -1);
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
ip_rt_multicast_event(in_dev);
}
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
dev_put(dev);
return 0;
}
-static inline void ipmr_cache_free(struct mfc_cache *c)
+static void ipmr_cache_free_rcu(struct rcu_head *head)
{
+ struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
kmem_cache_free(mrt_cachep, c);
}
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+ call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
/* Destroy an unresolved cache entry, killing queued skbs
- and reporting error to netlink readers.
+ * and reporting error to netlink readers.
*/
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
memset(&e->msg, 0, sizeof(e->msg));
rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
- } else
+ } else {
kfree_skb(skb);
+ }
}
ipmr_cache_free(c);
@@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
case 0:
if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
- if (dev && dev->ip_ptr == NULL) {
+ if (dev && __in_dev_get_rtnl(dev) == NULL) {
dev_put(dev);
return -EADDRNOTAVAIL;
}
- } else
+ } else {
dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
-
+ }
if (!dev)
return -EADDRNOTAVAIL;
err = dev_set_allmulti(dev, 1);
@@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EINVAL;
}
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev) {
dev_put(dev);
return -EADDRNOTAVAIL;
}
IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
ip_rt_multicast_event(in_dev);
- /*
- * Fill in the VIF structures
- */
+ /* Fill in the VIF structures */
+
v->rate_limit = vifc->vifc_rate_limit;
v->local = vifc->vifc_lcl_addr.s_addr;
v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
v->pkt_in = 0;
v->pkt_out = 0;
v->link = dev->ifindex;
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
v->link = dev->iflink;
/* And finish update writing critical data */
write_lock_bh(&mrt_lock);
v->dev = dev;
#ifdef CONFIG_IP_PIMSM
- if (v->flags&VIFF_REGISTER)
+ if (v->flags & VIFF_REGISTER)
mrt->mroute_reg_vif_num = vifi;
#endif
if (vifi+1 > mrt->maxvif)
@@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return 0;
}
+/* called with rcu_read_lock() */
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
__be32 origin,
__be32 mcastgrp)
@@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
int line = MFC_HASH(mcastgrp, origin);
struct mfc_cache *c;
- list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+ list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
return c;
}
@@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
- if (c == NULL)
- return NULL;
- c->mfc_un.res.minvif = MAXVIFS;
+
+ if (c)
+ c->mfc_un.res.minvif = MAXVIFS;
return c;
}
static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
- if (c == NULL)
- return NULL;
- skb_queue_head_init(&c->mfc_un.unres.unresolved);
- c->mfc_un.unres.expires = jiffies + 10*HZ;
+
+ if (c) {
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10*HZ;
+ }
return c;
}
@@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
struct sk_buff *skb;
struct nlmsgerr *e;
- /*
- * Play the pending entries through our router
- */
+ /* Play the pending entries through our router */
while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
- nlh->nlmsg_len = (skb_tail_pointer(skb) -
- (u8 *)nlh);
+ nlh->nlmsg_len = skb_tail_pointer(skb) -
+ (u8 *)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
}
rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
- } else
+ } else {
ip_mr_forward(net, mrt, skb, c, 0);
+ }
}
}
@@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
const int ihl = ip_hdrlen(pkt);
struct igmphdr *igmp;
struct igmpmsg *msg;
+ struct sock *mroute_sk;
int ret;
#ifdef CONFIG_IP_PIMSM
@@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT) {
/* Ugly, but we have no choice with this interface.
- Duplicate old header, fix ihl, length etc.
- And all this only to mangle msg->im_msgtype and
- to set msg->im_mbz to "mbz" :-)
+ * Duplicate old header, fix ihl, length etc.
+ * And all this only to mangle msg->im_msgtype and
+ * to set msg->im_mbz to "mbz" :-)
*/
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
@@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
#endif
{
- /*
- * Copy the IP header
- */
+ /* Copy the IP header */
skb->network_header = skb->tail;
skb_put(skb, ihl);
skb_copy_to_linear_data(skb, pkt->data, ihl);
- ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
+ ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
skb_dst_set(skb, dst_clone(skb_dst(pkt)));
- /*
- * Add our header
- */
+ /* Add our header */
- igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
igmp->type =
msg->im_msgtype = assert;
- igmp->code = 0;
- ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
skb->transport_header = skb->network_header;
}
- if (mrt->mroute_sk == NULL) {
+ rcu_read_lock();
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk == NULL) {
+ rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
- /*
- * Deliver to mrouted
- */
- ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
+ /* Deliver to mrouted */
+
+ ret = sock_queue_rcv_skb(mroute_sk, skb);
+ rcu_read_unlock();
if (ret < 0) {
if (net_ratelimit())
printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
}
if (!found) {
- /*
- * Create a new entry if allowable
- */
+ /* Create a new entry if allowable */
if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
(c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
return -ENOBUFS;
}
- /*
- * Fill in the new cache entry
- */
+ /* Fill in the new cache entry */
+
c->mfc_parent = -1;
c->mfc_origin = iph->saddr;
c->mfc_mcastgrp = iph->daddr;
- /*
- * Reflect first query at mrouted.
- */
+ /* Reflect first query at mrouted. */
+
err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
if (err < 0) {
/* If the report failed throw the cache entry
@@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
}
- /*
- * See if we can append the packet
- */
- if (c->mfc_un.unres.unresolved.qlen>3) {
+ /* See if we can append the packet */
+
+ if (c->mfc_un.unres.unresolved.qlen > 3) {
kfree_skb(skb);
err = -ENOBUFS;
} else {
@@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
- write_lock_bh(&mrt_lock);
- list_del(&c->list);
- write_unlock_bh(&mrt_lock);
+ list_del_rcu(&c->list);
ipmr_cache_free(c);
return 0;
@@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
- write_lock_bh(&mrt_lock);
- list_add(&c->list, &mrt->mfc_cache_array[line]);
- write_unlock_bh(&mrt_lock);
+ list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
/*
* Check to see if we resolved a queued list. If so we
@@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
LIST_HEAD(list);
struct mfc_cache *c, *next;
- /*
- * Shut down all active vif entries
- */
+ /* Shut down all active vif entries */
+
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif_table[i].flags&VIFF_STATIC))
+ if (!(mrt->vif_table[i].flags & VIFF_STATIC))
vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
- /*
- * Wipe the cache
- */
+ /* Wipe the cache */
+
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (c->mfc_flags&MFC_STATIC)
+ if (c->mfc_flags & MFC_STATIC)
continue;
- write_lock_bh(&mrt_lock);
- list_del(&c->list);
- write_unlock_bh(&mrt_lock);
-
+ list_del_rcu(&c->list);
ipmr_cache_free(c);
}
}
@@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
}
}
+/* called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
static void mrtsock_destruct(struct sock *sk)
{
struct net *net = sock_net(sk);
@@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk)
rtnl_lock();
ipmr_for_each_table(mrt, net) {
- if (sk == mrt->mroute_sk) {
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
-
- write_lock_bh(&mrt_lock);
- mrt->mroute_sk = NULL;
- write_unlock_bh(&mrt_lock);
-
+ rcu_assign_pointer(mrt->mroute_sk, NULL);
mroute_clean_tables(mrt);
}
}
@@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENOENT;
if (optname != MRT_INIT) {
- if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
+ if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
+ !capable(CAP_NET_ADMIN))
return -EACCES;
}
@@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENOPROTOOPT;
rtnl_lock();
- if (mrt->mroute_sk) {
+ if (rtnl_dereference(mrt->mroute_sk)) {
rtnl_unlock();
return -EADDRINUSE;
}
ret = ip_ra_control(sk, 1, mrtsock_destruct);
if (ret == 0) {
- write_lock_bh(&mrt_lock);
- mrt->mroute_sk = sk;
- write_unlock_bh(&mrt_lock);
-
+ rcu_assign_pointer(mrt->mroute_sk, sk);
IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
}
rtnl_unlock();
return ret;
case MRT_DONE:
- if (sk != mrt->mroute_sk)
+ if (sk != rcu_dereference_raw(mrt->mroute_sk))
return -EACCES;
return ip_ra_control(sk, 0, NULL);
case MRT_ADD_VIF:
@@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENFILE;
rtnl_lock();
if (optname == MRT_ADD_VIF) {
- ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
+ ret = vif_add(net, mrt, &vif,
+ sk == rtnl_dereference(mrt->mroute_sk));
} else {
ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
}
@@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (optname == MRT_DEL_MFC)
ret = ipmr_mfc_delete(mrt, &mfc);
else
- ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
+ ret = ipmr_mfc_add(net, mrt, &mfc,
+ sk == rtnl_dereference(mrt->mroute_sk));
rtnl_unlock();
return ret;
/*
@@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
case MRT_ASSERT:
{
int v;
- if (get_user(v,(int __user *)optval))
+ if (get_user(v, (int __user *)optval))
return -EFAULT;
mrt->mroute_do_assert = (v) ? 1 : 0;
return 0;
@@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
{
int v;
- if (get_user(v,(int __user *)optval))
+ if (get_user(v, (int __user *)optval))
return -EFAULT;
v = (v) ? 1 : 0;
@@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -EINVAL;
if (get_user(v, (u32 __user *)optval))
return -EFAULT;
- if (sk == mrt->mroute_sk)
- return -EBUSY;
rtnl_lock();
ret = 0;
- if (!ipmr_new_table(net, v))
- ret = -ENOMEM;
- raw_sk(sk)->ipmr_table = v;
+ if (sk == rtnl_dereference(mrt->mroute_sk)) {
+ ret = -EBUSY;
+ } else {
+ if (!ipmr_new_table(net, v))
+ ret = -ENOMEM;
+ raw_sk(sk)->ipmr_table = v;
+ }
rtnl_unlock();
return ret;
}
@@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
- optname!=MRT_PIM &&
+ optname != MRT_PIM &&
#endif
- optname!=MRT_ASSERT)
+ optname != MRT_ASSERT)
return -ENOPROTOOPT;
if (get_user(olr, optlen))
@@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
if (copy_from_user(&sr, arg, sizeof(sr)))
return -EFAULT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
if (c) {
sr.pktcnt = c->mfc_un.res.pkt;
sr.bytecnt = c->mfc_un.res.bytes;
sr.wrong_if = c->mfc_un.res.wrong_if;
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
default:
return -ENOIOCTLCMD;
@@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
};
/*
- * Encapsulate a packet by attaching a valid IPIP header to it.
+ * Encapsulate a packet by attaching a valid IPIP header to it.
* This avoids tunnel drivers and other mess and gives us the speed so
* important for multicast video.
*/
@@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
skb_reset_network_header(skb);
iph = ip_hdr(skb);
- iph->version = 4;
+ iph->version = 4;
iph->tos = old_iph->tos;
iph->ttl = old_iph->ttl;
iph->frag_off = 0;
@@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
static inline int ipmr_forward_finish(struct sk_buff *skb)
{
- struct ip_options * opt = &(IPCB(skb)->opt);
+ struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
@@ -1535,32 +1534,44 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
}
#endif
- if (vif->flags&VIFF_TUNNEL) {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = vif->remote,
- .saddr = vif->local,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ if (vif->flags & VIFF_TUNNEL) {
+ struct flowi fl = {
+ .oif = vif->link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = vif->remote,
+ .saddr = vif->local,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(net, &rt, &fl))
goto out_free;
encap = sizeof(struct iphdr);
} else {
- struct flowi fl = { .oif = vif->link,
- .nl_u = { .ip4_u =
- { .daddr = iph->daddr,
- .tos = RT_TOS(iph->tos) } },
- .proto = IPPROTO_IPIP };
+ struct flowi fl = {
+ .oif = vif->link,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .tos = RT_TOS(iph->tos)
+ }
+ },
+ .proto = IPPROTO_IPIP
+ };
+
if (ip_route_output_key(net, &rt, &fl))
goto out_free;
}
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
- if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
- allow to send ICMP, so that packets will disappear
- to blackhole.
+ * allow to send ICMP, so that packets will disappear
+ * to blackhole.
*/
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1568,7 +1579,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
goto out_free;
}
- encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
+ encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
@@ -1579,11 +1590,12 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
vif->bytes_out += skb->len;
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
ip_decrease_ttl(ip_hdr(skb));
/* FIXME: forward and output firewalls used to be called here.
- * What do we do with netfilter? -- RR */
+ * What do we do with netfilter? -- RR
+ */
if (vif->flags & VIFF_TUNNEL) {
ip_encap(skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
@@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (skb_rtable(skb)->fl.iif == 0) {
/* It is our own packet, looped back.
- Very complicated situation...
-
- The best workaround until routing daemons will be
- fixed is not to redistribute packet, if it was
- send through wrong interface. It means, that
- multicast applications WILL NOT work for
- (S,G), which have default multicast route pointing
- to wrong oif. In any case, it is not a good
- idea to use multicasting applications on router.
+ * Very complicated situation...
+ *
+ * The best workaround until routing daemons will be
+ * fixed is not to redistribute packet, if it was
+ * send through wrong interface. It means, that
+ * multicast applications WILL NOT work for
+ * (S,G), which have default multicast route pointing
+ * to wrong oif. In any case, it is not a good
+ * idea to use multicasting applications on router.
*/
goto dont_forward;
}
@@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
- so that we cannot check that packet arrived on an oif.
- It is bad, but otherwise we would need to move pretty
- large chunk of pimd to kernel. Ough... --ANK
+ * so that we cannot check that packet arrived on an oif.
+ * It is bad, but otherwise we would need to move pretty
+ * large chunk of pimd to kernel. Ough... --ANK
*/
(mrt->mroute_do_pim ||
cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
/*
* Forward the frame
*/
- for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+ for (ct = cache->mfc_un.res.maxvif - 1;
+ ct >= cache->mfc_un.res.minvif; ct--) {
if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
ipmr_queue_xmit(net, mrt, skb2, cache,
psend);
@@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
if (psend != -1) {
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
if (skb2)
ipmr_queue_xmit(net, mrt, skb2, cache, psend);
} else {
@@ -1713,6 +1728,7 @@ dont_forward:
/*
* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
*/
int ip_mr_input(struct sk_buff *skb)
@@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb)
int err;
/* Packet is looped back after forward, it should not be
- forwarded second time, but still can be delivered locally.
+ * forwarded second time, but still can be delivered locally.
*/
- if (IPCB(skb)->flags&IPSKB_FORWARDED)
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto dont_forward;
err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb)
}
if (!local) {
- if (IPCB(skb)->opt.router_alert) {
- if (ip_call_ra_chain(skb))
- return 0;
- } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
- /* IGMPv1 (and broken IGMPv2 implementations sort of
- Cisco IOS <= 11.2(8)) do not put router alert
- option to IGMP packets destined to routable
- groups. It is very bad, because it means
- that we can forward NO IGMP messages.
- */
- read_lock(&mrt_lock);
- if (mrt->mroute_sk) {
- nf_reset(skb);
- raw_rcv(mrt->mroute_sk, skb);
- read_unlock(&mrt_lock);
- return 0;
- }
- read_unlock(&mrt_lock);
+ if (IPCB(skb)->opt.router_alert) {
+ if (ip_call_ra_chain(skb))
+ return 0;
+ } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+ /* IGMPv1 (and broken IGMPv2 implementations sort of
+ * Cisco IOS <= 11.2(8)) do not put router alert
+ * option to IGMP packets destined to routable
+ * groups. It is very bad, because it means
+ * that we can forward NO IGMP messages.
+ */
+ struct sock *mroute_sk;
+
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (mroute_sk) {
+ nf_reset(skb);
+ raw_rcv(mroute_sk, skb);
+ return 0;
+ }
}
}
- read_lock(&mrt_lock);
+ /* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
/*
@@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb)
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
ip_local_deliver(skb);
- if (skb2 == NULL) {
- read_unlock(&mrt_lock);
+ if (skb2 == NULL)
return -ENOBUFS;
- }
skb = skb2;
}
+ read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, skb->dev);
if (vif >= 0) {
int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb)
return -ENODEV;
}
+ read_lock(&mrt_lock);
ip_mr_forward(net, mrt, skb, cache, local);
-
read_unlock(&mrt_lock);
if (local)
@@ -1805,6 +1820,7 @@ dont_forward:
}
#ifdef CONFIG_IP_PIMSM
+/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
unsigned int pimlen)
{
@@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
/*
- Check that:
- a. packet is really destinted to a multicast group
- b. packet is not a NULL-REGISTER
- c. packet is not truncated
+ * Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
*/
if (!ipv4_is_multicast(encap->daddr) ||
encap->tot_len == 0 ||
@@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
read_lock(&mrt_lock);
if (mrt->mroute_reg_vif_num >= 0)
reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- if (reg_dev)
- dev_hold(reg_dev);
read_unlock(&mrt_lock);
if (reg_dev == NULL)
return 1;
skb->mac_header = skb->network_header;
- skb_pull(skb, (u8*)encap - skb->data);
+ skb_pull(skb, (u8 *)encap - skb->data);
skb_reset_network_header(skb);
skb->protocol = htons(ETH_P_IP);
- skb->ip_summed = 0;
+ skb->ip_summed = CHECKSUM_NONE;
skb->pkt_type = PACKET_HOST;
skb_tunnel_rx(skb, reg_dev);
netif_rx(skb);
- dev_put(reg_dev);
- return 0;
+ return NET_RX_SUCCESS;
}
#endif
@@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
* Handle IGMP messages of PIMv1
*/
-int pim_rcv_v1(struct sk_buff * skb)
+int pim_rcv_v1(struct sk_buff *skb)
{
struct igmphdr *pim;
struct net *net = dev_net(skb->dev);
@@ -1881,7 +1894,7 @@ drop:
#endif
#ifdef CONFIG_IP_PIMSM_V2
-static int pim_rcv(struct sk_buff * skb)
+static int pim_rcv(struct sk_buff *skb)
{
struct pimreghdr *pim;
struct net *net = dev_net(skb->dev);
@@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
goto drop;
pim = (struct pimreghdr *)skb_transport_header(skb);
- if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
- (pim->flags&PIM_NULL_REGISTER) ||
+ if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+ (pim->flags & PIM_NULL_REGISTER) ||
(ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
goto drop;
@@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net,
if (mrt == NULL)
return -ENOENT;
- read_lock(&mrt_lock);
+ rcu_read_lock();
cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
if (cache == NULL) {
struct sk_buff *skb2;
struct iphdr *iph;
struct net_device *dev;
- int vif;
+ int vif = -1;
if (nowait) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EAGAIN;
}
dev = skb->dev;
- if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
+ read_lock(&mrt_lock);
+ if (dev)
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif < 0) {
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENODEV;
}
skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2) {
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net,
iph->version = 0;
err = ipmr_cache_unresolved(mrt, vif, skb2);
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
- if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+ read_lock(&mrt_lock);
+ if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
cache->mfc_flags |= MFC_NOTIFY;
err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
@@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[1];
s_e = cb->args[2];
- read_lock(&mrt_lock);
+ rcu_read_lock();
ipmr_for_each_table(mrt, net) {
if (t < s_t)
goto next_table;
if (t > s_t)
s_h = 0;
for (h = s_h; h < MFC_LINES; h++) {
- list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
if (e < s_e)
goto next_entry;
if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2096,7 @@ next_table:
t++;
}
done:
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
cb->args[2] = e;
cb->args[1] = h;
@@ -2086,7 +2107,8 @@ done:
#ifdef CONFIG_PROC_FS
/*
- * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
+ * The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
*/
struct ipmr_vif_iter {
struct seq_net_private p;
@@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct mr_table *mrt = it->mrt;
struct mfc_cache *mfc;
- read_lock(&mrt_lock);
+ rcu_read_lock();
for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
it->cache = &mrt->mfc_cache_array[it->ct];
- list_for_each_entry(mfc, it->cache, list)
+ list_for_each_entry_rcu(mfc, it->cache, list)
if (pos-- == 0)
return mfc;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
spin_lock_bh(&mfc_unres_lock);
it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
/* exhausted cache_array, show unresolved */
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
it->cache = &mrt->mfc_unres_queue;
it->ct = 0;
@@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (!list_empty(it->cache))
return list_first_entry(it->cache, struct mfc_cache, list);
- end_of_list:
+end_of_list:
spin_unlock_bh(&mfc_unres_lock);
it->cache = NULL;
@@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
if (it->cache == &mrt->mfc_unres_queue)
spin_unlock_bh(&mfc_unres_lock);
else if (it->cache == &mrt->mfc_cache_array[it->ct])
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
mfc->mfc_un.res.bytes,
mfc->mfc_un.res.wrong_if);
for (n = mfc->mfc_un.res.minvif;
- n < mfc->mfc_un.res.maxvif; n++ ) {
+ n < mfc->mfc_un.res.maxvif; n++) {
if (VIF_EXISTS(mrt, n) &&
mfc->mfc_un.res.ttls[n] < 255)
seq_printf(seq,
@@ -2421,7 +2443,7 @@ int __init ip_mr_init(void)
mrt_cachep = kmem_cache_create("ip_mrt_cache",
sizeof(struct mfc_cache),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
if (!mrt_cachep)
return -ENOMEM;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 07de855e2175..d88a46c54fd1 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -43,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
/* Drop old route. */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
} else {
/* non-local src, find valid iif to satisfy
* rp-filter when calling ip_route_input. */
@@ -53,11 +53,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
orefdst = skb->_skb_refdst;
if (ip_route_input(skb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
- dst_release(&rt->u.dst);
+ RT_TOS(iph->tos), rt->dst.dev) != 0) {
+ dst_release(&rt->dst);
return -1;
}
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
refdst_drop(orefdst);
}
@@ -212,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
skb->len - dataoff, 0);
skb->ip_summed = CHECKSUM_NONE;
- csum = __skb_checksum_complete_head(skb, dataoff + len);
- if (!csum)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ return __skb_checksum_complete_head(skb, dataoff + len);
}
return csum;
}
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..babd1a2bae5f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -147,7 +147,7 @@ config IP_NF_TARGET_ULOG
which can only be viewed through syslog.
The appropriate userspace logging daemon (ulogd) may be obtained from
- <http://www.gnumonks.org/projects/ulogd/>
+ <http://www.netfilter.org/projects/ulogd/index.html>
To compile it as a module, choose M here. If unsure, say N.
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
config IP_NF_TARGET_TTL
tristate '"TTL" target support'
- depends on NETFILTER_ADVANCED
+ depends on NETFILTER_ADVANCED && IP_NF_MANGLE
select NETFILTER_XT_TARGET_HL
---help---
- This is a backwards-compat option for the user's convenience
+ This is a backwards-compatible option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_HL.
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1ac01b128621..3cad2591ace0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
for (i = 0; i < len; i++)
ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
- return (ret != 0);
+ return ret != 0;
}
/*
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
return NF_DROP;
}
-static inline const struct arpt_entry_target *
+static inline const struct xt_entry_target *
arpt_get_target_c(const struct arpt_entry *e)
{
return arpt_get_target((struct arpt_entry *)e);
@@ -282,17 +282,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,
arp = arp_hdr(skb);
do {
- const struct arpt_entry_target *t;
- int hdr_len;
+ const struct xt_entry_target *t;
if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
e = arpt_next_entry(e);
continue;
}
- hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
- (2 * skb->dev->addr_len);
- ADD_COUNTER(e->counters, hdr_len, 1);
+ ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
t = arpt_get_target_c(e);
@@ -300,10 +297,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
if (!t->u.kernel.target->target) {
int v;
- v = ((struct arpt_standard_target *)t)->verdict;
+ v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
- if (v != ARPT_RETURN) {
+ if (v != XT_RETURN) {
verdict = (unsigned)(-v) - 1;
break;
}
@@ -335,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
/* Target might have changed stuff. */
arp = arp_hdr(skb);
- if (verdict == ARPT_CONTINUE)
+ if (verdict == XT_CONTINUE)
e = arpt_next_entry(e);
else
/* Verdict */
@@ -380,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- const struct arpt_standard_target *t
+ const struct xt_standard_target *t
= (void *)arpt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
@@ -395,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
/* Unconditional return/END. */
if ((e->target_offset == sizeof(struct arpt_entry) &&
(strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < 0 && unconditional(&e->arp)) ||
visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -436,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
if (newpos > newinfo->size -
sizeof(struct arpt_entry)) {
@@ -467,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
static inline int check_entry(const struct arpt_entry *e, const char *name)
{
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
if (!arp_checkentry(&e->arp)) {
duprintf("arp_tables: arp check failed %p %s.\n", e, name);
return -EINVAL;
}
- if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
+ if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
return -EINVAL;
t = arpt_get_target_c(e);
@@ -486,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
static inline int check_target(struct arpt_entry *e, const char *name)
{
- struct arpt_entry_target *t = arpt_get_target(e);
+ struct xt_entry_target *t = arpt_get_target(e);
int ret;
struct xt_tgchk_param par = {
.table = name,
@@ -509,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
static inline int
find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
int ret;
@@ -539,7 +536,7 @@ out:
static bool check_underflow(const struct arpt_entry *e)
{
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int verdict;
if (!unconditional(&e->arp))
@@ -547,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e)
t = arpt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
return false;
- verdict = ((struct arpt_standard_target *)t)->verdict;
+ verdict = ((struct xt_standard_target *)t)->verdict;
verdict = -verdict - 1;
return verdict == NF_DROP || verdict == NF_ACCEPT;
}
@@ -569,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
}
if (e->next_offset
- < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+ < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
duprintf("checking: element %p size %u\n",
e, e->next_offset);
return -EINVAL;
@@ -601,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
static inline void cleanup_entry(struct arpt_entry *e)
{
struct xt_tgdtor_param par;
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
t = arpt_get_target(e);
par.target = t->u.kernel.target;
@@ -713,7 +710,7 @@ static void get_counters(const struct xt_table_info *t,
struct arpt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
+ unsigned int curcpu = get_cpu();
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
@@ -723,19 +720,22 @@ static void get_counters(const struct xt_table_info *t,
* if new softirq were to run and call ipt_do_table
*/
local_bh_disable();
- curcpu = smp_processor_id();
-
i = 0;
xt_entry_foreach(iter, t->entries[curcpu], t->size) {
SET_COUNTER(counters[i], iter->counters.bcnt,
iter->counters.pcnt);
++i;
}
+ local_bh_enable();
+ /* Processing counters from other cpus, we can let bottom half enabled,
+ * (preemption is disabled)
+ */
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
i = 0;
+ local_bh_disable();
xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -743,8 +743,9 @@ static void get_counters(const struct xt_table_info *t,
++i;
}
xt_info_wrunlock(cpu);
+ local_bh_enable();
}
- local_bh_enable();
+ put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -758,7 +759,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
* about).
*/
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = vmalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -793,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
e = (struct arpt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
@@ -806,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size,
t = arpt_get_target_c(e);
if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct arpt_entry_target,
+ + offsetof(struct xt_entry_target,
u.user.name),
t->u.kernel.target->name,
strlen(t->u.kernel.target->name)+1) != 0) {
@@ -843,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
const struct xt_table_info *info,
const void *base, struct xt_table_info *newinfo)
{
- const struct arpt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
@@ -894,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info,
static int get_info(struct net *net, void __user *user,
const int *len, int compat)
{
- char name[ARPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
int ret;
@@ -907,7 +908,7 @@ static int get_info(struct net *net, void __user *user,
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
- name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_lock(NFPROTO_ARP);
@@ -1005,8 +1006,7 @@ static int __do_replace(struct net *net, const char *name,
struct arpt_entry *iter;
ret = 0;
- counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
- numa_node_id());
+ counters = vmalloc(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
@@ -1159,7 +1159,7 @@ static int do_add_counters(struct net *net, const void __user *user,
if (len != size + num_counters * sizeof(struct xt_counters))
return -EINVAL;
- paddc = vmalloc_node(len - size, numa_node_id());
+ paddc = vmalloc(len - size);
if (!paddc)
return -ENOMEM;
@@ -1204,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user,
#ifdef CONFIG_COMPAT
static inline void compat_release_entry(struct compat_arpt_entry *e)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
t = compat_arpt_get_target(e);
module_put(t->u.kernel.target->me);
@@ -1220,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
const unsigned int *underflows,
const char *name)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
unsigned int entry_offset;
int ret, off, h;
@@ -1288,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
unsigned int *size, const char *name,
struct xt_table_info *newinfo, unsigned char *base)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
struct arpt_entry *de;
unsigned int origsize;
@@ -1420,6 +1420,9 @@ static int translate_compat_table(const char *name,
if (ret != 0)
break;
++i;
+ if (strcmp(arpt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
if (ret) {
/*
@@ -1471,7 +1474,7 @@ out_unlock:
}
struct compat_arpt_replace {
- char name[ARPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
u32 num_entries;
u32 size;
@@ -1564,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
struct xt_counters *counters,
unsigned int i)
{
- struct arpt_entry_target *t;
+ struct xt_entry_target *t;
struct compat_arpt_entry __user *ce;
u_int16_t target_offset, next_offset;
compat_uint_t origsize;
@@ -1625,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
}
struct compat_arpt_get_entries {
- char name[ARPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
struct compat_arpt_entry entrytable[0];
};
@@ -1825,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table)
/* The built-in targets: standard (NULL) and error. */
static struct xt_target arpt_builtin_tg[] __read_mostly = {
{
- .name = ARPT_STANDARD_TARGET,
+ .name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_ARP,
#ifdef CONFIG_COMPAT
@@ -1835,9 +1838,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
#endif
},
{
- .name = ARPT_ERROR_TARGET,
+ .name = XT_ERROR_TARGET,
.target = arpt_error,
- .targetsize = ARPT_FUNCTION_MAXNAMELEN,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
.family = NFPROTO_ARP,
},
};
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171b..b8ddcc480ed9 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par)
return false;
if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
- mangle->target != ARPT_CONTINUE)
+ mangle->target != XT_CONTINUE)
return false;
return true;
}
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index a4e5fc5df4bf..d2c1311cb28d 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -42,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_RWLOCK(queue_lock);
+static DEFINE_SPINLOCK(queue_lock);
static int peer_pid __read_mostly;
static unsigned int copy_range __read_mostly;
static unsigned int queue_total;
@@ -72,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
break;
case IPQ_COPY_PACKET:
- copy_mode = mode;
+ if (range > 0xFFFF)
+ range = 0xFFFF;
copy_range = range;
- if (copy_range > 0xFFFF)
- copy_range = 0xFFFF;
+ copy_mode = mode;
break;
default:
@@ -101,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id)
{
struct nf_queue_entry *entry = NULL, *i;
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
list_for_each_entry(i, &queue_list, list) {
if ((unsigned long)i == id) {
@@ -115,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id)
queue_total--;
}
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return entry;
}
@@ -136,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
static void
ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
{
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
__ipq_flush(cmpfn, data);
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
}
static struct sk_buff *
@@ -152,9 +152,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
struct nlmsghdr *nlh;
struct timeval tv;
- read_lock_bh(&queue_lock);
-
- switch (copy_mode) {
+ switch (ACCESS_ONCE(copy_mode)) {
case IPQ_COPY_META:
case IPQ_COPY_NONE:
size = NLMSG_SPACE(sizeof(*pmsg));
@@ -162,26 +160,21 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
case IPQ_COPY_PACKET:
if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
- (*errp = skb_checksum_help(entry->skb))) {
- read_unlock_bh(&queue_lock);
+ (*errp = skb_checksum_help(entry->skb)))
return NULL;
- }
- if (copy_range == 0 || copy_range > entry->skb->len)
+
+ data_len = ACCESS_ONCE(copy_range);
+ if (data_len == 0 || data_len > entry->skb->len)
data_len = entry->skb->len;
- else
- data_len = copy_range;
size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
break;
default:
*errp = -EINVAL;
- read_unlock_bh(&queue_lock);
return NULL;
}
- read_unlock_bh(&queue_lock);
-
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb)
goto nlmsg_failure;
@@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
if (nskb == NULL)
return status;
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
if (!peer_pid)
goto err_out_free_nskb;
@@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
__ipq_enqueue_entry(entry);
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return status;
err_out_free_nskb:
kfree_skb(nskb);
err_out_unlock:
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return status;
}
@@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range)
{
int status;
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
status = __ipq_set_mode(mode, range);
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return status;
}
@@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb)
if (security_netlink_recv(skb, CAP_NET_ADMIN))
RCV_SKB_FAIL(-EPERM);
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
if (peer_pid) {
if (peer_pid != pid) {
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
RCV_SKB_FAIL(-EBUSY);
}
} else {
@@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
peer_pid = pid;
}
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
status = ipq_receive_peer(NLMSG_DATA(nlh), type,
nlmsglen - NLMSG_LENGTH(0));
@@ -497,10 +490,10 @@ ipq_rcv_nl_event(struct notifier_block *this,
struct netlink_notify *n = ptr;
if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
__ipq_reset();
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
}
return NOTIFY_DONE;
}
@@ -527,7 +520,7 @@ static ctl_table ipq_table[] = {
#ifdef CONFIG_PROC_FS
static int ip_queue_show(struct seq_file *m, void *v)
{
- read_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
seq_printf(m,
"Peer PID : %d\n"
@@ -545,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v)
queue_dropped,
queue_user_dropped);
- read_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return 0;
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4b6c5ca610fc..d31b007a6d80 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
}
/* for const-correctness */
-static inline const struct ipt_entry_target *
+static inline const struct xt_entry_target *
ipt_get_target_c(const struct ipt_entry *e)
{
return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +230,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
const char *hookname, const char **chainname,
const char **comment, unsigned int *rulenum)
{
- const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
+ const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
- if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
+ if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
/* Head of user chain: ERROR target with chainname */
*chainname = t->target.data;
(*rulenum) = 0;
@@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
if (s->target_offset == sizeof(struct ipt_entry) &&
strcmp(t->target.u.kernel.target->name,
- IPT_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
t->verdict < 0 &&
unconditional(&s->ip)) {
/* Tail of chains: STANDARD target (return/policy) */
@@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb,
get_entry(table_base, private->underflow[hook]));
do {
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
IP_NF_ASSERT(e);
@@ -364,7 +364,7 @@ ipt_do_table(struct sk_buff *skb,
goto no_match;
}
- ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+ ADD_COUNTER(e->counters, skb->len, 1);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -380,10 +380,10 @@ ipt_do_table(struct sk_buff *skb,
if (!t->u.kernel.target->target) {
int v;
- v = ((struct ipt_standard_target *)t)->verdict;
+ v = ((struct xt_standard_target *)t)->verdict;
if (v < 0) {
/* Pop from stack? */
- if (v != IPT_RETURN) {
+ if (v != XT_RETURN) {
verdict = (unsigned)(-v) - 1;
break;
}
@@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb,
verdict = t->u.kernel.target->target(skb, &acpar);
/* Target might have changed stuff. */
ip = ip_hdr(skb);
- if (verdict == IPT_CONTINUE)
+ if (verdict == XT_CONTINUE)
e = ipt_next_entry(e);
else
/* Verdict */
@@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- const struct ipt_standard_target *t
+ const struct xt_standard_target *t
= (void *)ipt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
@@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
/* Unconditional return/END. */
if ((e->target_offset == sizeof(struct ipt_entry) &&
(strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < 0 && unconditional(&e->ip)) ||
visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0) &&
+ XT_STANDARD_TARGET) == 0) &&
t->verdict < -NF_MAX_VERDICT - 1) {
duprintf("mark_source_chains: bad "
"negative verdict (%i)\n",
@@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0 &&
+ XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
if (newpos > newinfo->size -
sizeof(struct ipt_entry)) {
@@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
return 1;
}
-static void cleanup_match(struct ipt_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
{
struct xt_mtdtor_param par;
@@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
static int
check_entry(const struct ipt_entry *e, const char *name)
{
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
if (!ip_checkentry(&e->ip)) {
duprintf("ip check failed %p %s.\n", e, par->match->name);
return -EINVAL;
}
- if (e->target_offset + sizeof(struct ipt_entry_target) >
+ if (e->target_offset + sizeof(struct xt_entry_target) >
e->next_offset)
return -EINVAL;
@@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name)
}
static int
-check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
const struct ipt_ip *ip = par->entryinfo;
int ret;
@@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
}
static int
-find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
struct xt_match *match;
int ret;
@@ -630,7 +630,7 @@ err:
static int check_target(struct ipt_entry *e, struct net *net, const char *name)
{
- struct ipt_entry_target *t = ipt_get_target(e);
+ struct xt_entry_target *t = ipt_get_target(e);
struct xt_tgchk_param par = {
.net = net,
.table = name,
@@ -656,7 +656,7 @@ static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int size)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
int ret;
unsigned int j;
@@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
static bool check_underflow(const struct ipt_entry *e)
{
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int verdict;
if (!unconditional(&e->ip))
@@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e)
t = ipt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
return false;
- verdict = ((struct ipt_standard_target *)t)->verdict;
+ verdict = ((struct xt_standard_target *)t)->verdict;
verdict = -verdict - 1;
return verdict == NF_DROP || verdict == NF_ACCEPT;
}
@@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
}
if (e->next_offset
- < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+ < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
duprintf("checking: element %p size %u\n",
e, e->next_offset);
return -EINVAL;
@@ -771,7 +771,7 @@ static void
cleanup_entry(struct ipt_entry *e, struct net *net)
{
struct xt_tgdtor_param par;
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_entry_match *ematch;
/* Cleanup all matches */
@@ -884,7 +884,7 @@ get_counters(const struct xt_table_info *t,
struct ipt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
+ unsigned int curcpu = get_cpu();
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
@@ -894,19 +894,22 @@ get_counters(const struct xt_table_info *t,
* if new softirq were to run and call ipt_do_table
*/
local_bh_disable();
- curcpu = smp_processor_id();
-
i = 0;
xt_entry_foreach(iter, t->entries[curcpu], t->size) {
SET_COUNTER(counters[i], iter->counters.bcnt,
iter->counters.pcnt);
++i;
}
+ local_bh_enable();
+ /* Processing counters from other cpus, we can let bottom half enabled,
+ * (preemption is disabled)
+ */
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
i = 0;
+ local_bh_disable();
xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -914,8 +917,9 @@ get_counters(const struct xt_table_info *t,
++i; /* macro does multi eval of i */
}
xt_info_wrunlock(cpu);
+ local_bh_enable();
}
- local_bh_enable();
+ put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -928,7 +932,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = vmalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -968,8 +972,8 @@ copy_entries_to_user(unsigned int total_size,
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
unsigned int i;
- const struct ipt_entry_match *m;
- const struct ipt_entry_target *t;
+ const struct xt_entry_match *m;
+ const struct xt_entry_target *t;
e = (struct ipt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
@@ -986,7 +990,7 @@ copy_entries_to_user(unsigned int total_size,
m = (void *)e + i;
if (copy_to_user(userptr + off + i
- + offsetof(struct ipt_entry_match,
+ + offsetof(struct xt_entry_match,
u.user.name),
m->u.kernel.match->name,
strlen(m->u.kernel.match->name)+1)
@@ -998,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size,
t = ipt_get_target_c(e);
if (copy_to_user(userptr + off + e->target_offset
- + offsetof(struct ipt_entry_target,
+ + offsetof(struct xt_entry_target,
u.user.name),
t->u.kernel.target->name,
strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1036,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
const void *base, struct xt_table_info *newinfo)
{
const struct xt_entry_match *ematch;
- const struct ipt_entry_target *t;
+ const struct xt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
@@ -1088,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info,
static int get_info(struct net *net, void __user *user,
const int *len, int compat)
{
- char name[IPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
int ret;
@@ -1101,7 +1105,7 @@ static int get_info(struct net *net, void __user *user,
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
- name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+ name[XT_TABLE_MAXNAMELEN-1] = '\0';
#ifdef CONFIG_COMPAT
if (compat)
xt_compat_lock(AF_INET);
@@ -1352,7 +1356,7 @@ do_add_counters(struct net *net, const void __user *user,
if (len != size + num_counters * sizeof(struct xt_counters))
return -EINVAL;
- paddc = vmalloc_node(len - size, numa_node_id());
+ paddc = vmalloc(len - size);
if (!paddc)
return -ENOMEM;
@@ -1396,14 +1400,14 @@ do_add_counters(struct net *net, const void __user *user,
#ifdef CONFIG_COMPAT
struct compat_ipt_replace {
- char name[IPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
u32 num_entries;
u32 size;
u32 hook_entry[NF_INET_NUMHOOKS];
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
- compat_uptr_t counters; /* struct ipt_counters * */
+ compat_uptr_t counters; /* struct xt_counters * */
struct compat_ipt_entry entries[0];
};
@@ -1412,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
unsigned int *size, struct xt_counters *counters,
unsigned int i)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct compat_ipt_entry __user *ce;
u_int16_t target_offset, next_offset;
compat_uint_t origsize;
@@ -1447,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
}
static int
-compat_find_calc_match(struct ipt_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
const char *name,
const struct ipt_ip *ip,
unsigned int hookmask,
@@ -1469,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
static void compat_release_entry(struct compat_ipt_entry *e)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_entry_match *ematch;
/* Cleanup all matches */
@@ -1490,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
const char *name)
{
struct xt_entry_match *ematch;
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
unsigned int entry_offset;
unsigned int j;
@@ -1572,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
unsigned int *size, const char *name,
struct xt_table_info *newinfo, unsigned char *base)
{
- struct ipt_entry_target *t;
+ struct xt_entry_target *t;
struct xt_target *target;
struct ipt_entry *de;
unsigned int origsize;
@@ -1747,6 +1751,9 @@ translate_compat_table(struct net *net,
if (ret != 0)
break;
++i;
+ if (strcmp(ipt_get_target(iter1)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
if (ret) {
/*
@@ -1877,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
}
struct compat_ipt_get_entries {
- char name[IPT_TABLE_MAXNAMELEN];
+ char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
struct compat_ipt_entry entrytable[0];
};
@@ -2032,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IPT_SO_GET_REVISION_MATCH:
case IPT_SO_GET_REVISION_TARGET: {
- struct ipt_get_revision rev;
+ struct xt_get_revision rev;
int target;
if (*len != sizeof(rev)) {
@@ -2169,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
static struct xt_target ipt_builtin_tg[] __read_mostly = {
{
- .name = IPT_STANDARD_TARGET,
+ .name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_IPV4,
#ifdef CONFIG_COMPAT
@@ -2179,9 +2186,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
#endif
},
{
- .name = IPT_ERROR_TARGET,
+ .name = XT_ERROR_TARGET,
.target = ipt_error,
- .targetsize = IPT_FUNCTION_MAXNAMELEN,
+ .targetsize = XT_FUNCTION_MAXNAMELEN,
.family = NFPROTO_IPV4,
},
};
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index f91c94b9a790..1e26a4897655 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/net_namespace.h>
#include <net/checksum.h>
+#include <net/ip.h>
#define CLUSTERIP_VERSION "0.8"
@@ -53,12 +54,13 @@ struct clusterip_config {
#endif
enum clusterip_hashmode hash_mode; /* which hashing mode */
u_int32_t hash_initval; /* hash initialization */
+ struct rcu_head rcu;
};
static LIST_HEAD(clusterip_configs);
/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_RWLOCK(clusterip_lock);
+static DEFINE_SPINLOCK(clusterip_lock);
#ifdef CONFIG_PROC_FS
static const struct file_operations clusterip_proc_fops;
@@ -71,11 +73,17 @@ clusterip_config_get(struct clusterip_config *c)
atomic_inc(&c->refcount);
}
+
+static void clusterip_config_rcu_free(struct rcu_head *head)
+{
+ kfree(container_of(head, struct clusterip_config, rcu));
+}
+
static inline void
clusterip_config_put(struct clusterip_config *c)
{
if (atomic_dec_and_test(&c->refcount))
- kfree(c);
+ call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
}
/* decrease the count of entries using/referencing this config. If last
@@ -84,10 +92,11 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
- write_lock_bh(&clusterip_lock);
- if (atomic_dec_and_test(&c->entries)) {
- list_del(&c->list);
- write_unlock_bh(&clusterip_lock);
+ local_bh_disable();
+ if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+ list_del_rcu(&c->list);
+ spin_unlock(&clusterip_lock);
+ local_bh_enable();
dev_mc_del(c->dev, c->clustermac);
dev_put(c->dev);
@@ -100,7 +109,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
#endif
return;
}
- write_unlock_bh(&clusterip_lock);
+ local_bh_enable();
}
static struct clusterip_config *
@@ -108,7 +117,7 @@ __clusterip_config_find(__be32 clusterip)
{
struct clusterip_config *c;
- list_for_each_entry(c, &clusterip_configs, list) {
+ list_for_each_entry_rcu(c, &clusterip_configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -121,16 +130,15 @@ clusterip_config_find_get(__be32 clusterip, int entry)
{
struct clusterip_config *c;
- read_lock_bh(&clusterip_lock);
+ rcu_read_lock_bh();
c = __clusterip_config_find(clusterip);
- if (!c) {
- read_unlock_bh(&clusterip_lock);
- return NULL;
+ if (c) {
+ if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+ c = NULL;
+ else if (entry)
+ atomic_inc(&c->entries);
}
- atomic_inc(&c->refcount);
- if (entry)
- atomic_inc(&c->entries);
- read_unlock_bh(&clusterip_lock);
+ rcu_read_unlock_bh();
return c;
}
@@ -181,9 +189,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- write_lock_bh(&clusterip_lock);
- list_add(&c->list, &clusterip_configs);
- write_unlock_bh(&clusterip_lock);
+ spin_lock_bh(&clusterip_lock);
+ list_add_rcu(&c->list, &clusterip_configs);
+ spin_unlock_bh(&clusterip_lock);
return c;
}
@@ -224,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
{
const struct iphdr *iph = ip_hdr(skb);
unsigned long hashval;
- u_int16_t sport, dport;
- const u_int16_t *ports;
-
- switch (iph->protocol) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- case IPPROTO_ICMP:
- ports = (const void *)iph+iph->ihl*4;
- sport = ports[0];
- dport = ports[1];
- break;
- default:
+ u_int16_t sport = 0, dport = 0;
+ int poff;
+
+ poff = proto_ports_offset(iph->protocol);
+ if (poff >= 0) {
+ const u_int16_t *ports;
+ u16 _ports[2];
+
+ ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
+ if (ports) {
+ sport = ports[0];
+ dport = ports[1];
+ }
+ } else {
if (net_ratelimit())
pr_info("unknown protocol %u\n", iph->protocol);
- sport = dport = 0;
}
switch (config->hash_mode) {
@@ -462,7 +468,7 @@ struct arp_payload {
__be32 src_ip;
u_int8_t dst_hw[ETH_ALEN];
__be32 dst_ip;
-} __attribute__ ((packed));
+} __packed;
#ifdef DEBUG
static void arp_print(struct arp_payload *payload)
@@ -733,6 +739,9 @@ static void __exit clusterip_tg_exit(void)
#endif
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
+
+ /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
+ rcu_barrier_bh();
}
module_init(clusterip_tg_init);
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 5234f4f3499a..72ffc8fda2e9 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/ip.h>
#include <net/icmp.h>
#include <net/udp.h>
@@ -23,16 +24,15 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_LOG.h>
#include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
/* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+ const struct nf_loginfo *info,
const struct sk_buff *skb,
unsigned int iphoff)
{
@@ -47,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Important fields:
* TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- printk("SRC=%pI4 DST=%pI4 ",
+ sb_add(m, "SRC=%pI4 DST=%pI4 ",
&ih->saddr, &ih->daddr);
/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
/* Max length: 6 "CE DF MF " */
if (ntohs(ih->frag_off) & IP_CE)
- printk("CE ");
+ sb_add(m, "CE ");
if (ntohs(ih->frag_off) & IP_DF)
- printk("DF ");
+ sb_add(m, "DF ");
if (ntohs(ih->frag_off) & IP_MF)
- printk("MF ");
+ sb_add(m, "MF ");
/* Max length: 11 "FRAG:65535 " */
if (ntohs(ih->frag_off) & IP_OFFSET)
- printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+ sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
if ((logflags & IPT_LOG_IPOPT) &&
ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
op = skb_header_pointer(skb, iphoff+sizeof(_iph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
switch (ih->protocol) {
@@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct tcphdr *th;
/* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
+ sb_add(m, "PROTO=TCP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -110,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_tcph), &_tcph);
if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
+ sb_add(m, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
if (logflags & IPT_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
+ sb_add(m, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
+ sb_add(m, "WINDOW=%u ", ntohs(th->window));
/* Max length: 9 "RES=0x3F " */
- printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
if (th->cwr)
- printk("CWR ");
+ sb_add(m, "CWR ");
if (th->ece)
- printk("ECE ");
+ sb_add(m, "ECE ");
if (th->urg)
- printk("URG ");
+ sb_add(m, "URG ");
if (th->ack)
- printk("ACK ");
+ sb_add(m, "ACK ");
if (th->psh)
- printk("PSH ");
+ sb_add(m, "PSH ");
if (th->rst)
- printk("RST ");
+ sb_add(m, "RST ");
if (th->syn)
- printk("SYN ");
+ sb_add(m, "SYN ");
if (th->fin)
- printk("FIN ");
+ sb_add(m, "FIN ");
/* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
+ sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
if ((logflags & IPT_LOG_TCPOPT) &&
th->doff * 4 > sizeof(struct tcphdr)) {
@@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
iphoff+ih->ihl*4+sizeof(_tcph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ sb_add(m, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ sb_add(m, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ sb_add(m, "%02X", op[i]);
+ sb_add(m, ") ");
}
break;
}
@@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
if (ih->protocol == IPPROTO_UDP)
/* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
+ sb_add(m, "PROTO=UDP " );
else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
+ sb_add(m, "PROTO=UDPLITE ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_udph), &_udph);
if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
+ sb_add(m, "SPT=%u DPT=%u LEN=%u ",
ntohs(uh->source), ntohs(uh->dest),
ntohs(uh->len));
break;
@@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
[ICMP_ADDRESSREPLY] = 12 };
/* Max length: 11 "PROTO=ICMP " */
- printk("PROTO=ICMP ");
+ sb_add(m, "PROTO=ICMP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_icmph), &_icmph);
if (ich == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+ sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
if (ich->type <= NR_ICMP_TYPES &&
required_len[ich->type] &&
skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
@@ -250,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
case ICMP_ECHOREPLY:
case ICMP_ECHO:
/* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
+ sb_add(m, "ID=%u SEQ=%u ",
ntohs(ich->un.echo.id),
ntohs(ich->un.echo.sequence));
break;
case ICMP_PARAMETERPROB:
/* Max length: 14 "PARAMETER=255 " */
- printk("PARAMETER=%u ",
+ sb_add(m, "PARAMETER=%u ",
ntohl(ich->un.gateway) >> 24);
break;
case ICMP_REDIRECT:
/* Max length: 24 "GATEWAY=255.255.255.255 " */
- printk("GATEWAY=%pI4 ", &ich->un.gateway);
+ sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
/* Fall through */
case ICMP_DEST_UNREACH:
case ICMP_SOURCE_QUENCH:
case ICMP_TIME_EXCEEDED:
/* Max length: 3+maxlen */
if (!iphoff) { /* Only recurse once. */
- printk("[");
- dump_packet(info, skb,
+ sb_add(m, "[");
+ dump_packet(m, info, skb,
iphoff + ih->ihl*4+sizeof(_icmph));
- printk("] ");
+ sb_add(m, "] ");
}
/* Max length: 10 "MTU=65535 " */
if (ich->type == ICMP_DEST_UNREACH &&
ich->code == ICMP_FRAG_NEEDED)
- printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+ sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
}
break;
}
@@ -291,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
break;
/* Max length: 9 "PROTO=AH " */
- printk("PROTO=AH ");
+ sb_add(m, "PROTO=AH ");
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_ahdr), &_ahdr);
if (ah == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(ah->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
break;
}
case IPPROTO_ESP: {
@@ -311,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_esp_hdr *eh;
/* Max length: 10 "PROTO=ESP " */
- printk("PROTO=ESP ");
+ sb_add(m, "PROTO=ESP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -320,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_esph), &_esph);
if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ sb_add(m, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(eh->spi));
+ sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
break;
}
/* Max length: 10 "PROTO 255 " */
default:
- printk("PROTO=%u ", ih->protocol);
+ sb_add(m, "PROTO=%u ", ih->protocol);
}
/* Max length: 15 "UID=4294967295 " */
if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u GID=%u ",
+ sb_add(m, "UID=%u GID=%u ",
skb->sk->sk_socket->file->f_cred->fsuid,
skb->sk->sk_socket->file->f_cred->fsgid);
read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -346,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (!iphoff && skb->mark)
- printk("MARK=0x%x ", skb->mark);
+ sb_add(m, "MARK=0x%x ", skb->mark);
/* Proto Max log string length */
/* IP: 40+46+6+11+127 = 230 */
@@ -363,6 +363,43 @@ static void dump_packet(const struct nf_loginfo *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
+static void dump_mac_header(struct sbuff *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & IPT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ sb_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ sb_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++, p++)
+ sb_add(m, ":%02x", *p);
+ }
+ sb_add(m, " ");
+}
+
static struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
@@ -382,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
+ struct sbuff *m = sb_open();
+
if (!loginfo)
loginfo = &default_loginfo;
- spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
prefix,
in ? in->name : "",
out ? out->name : "");
@@ -397,31 +435,20 @@ ipt_log_packet(u_int8_t pf,
physindev = skb->nf_bridge->physindev;
if (physindev && in != physindev)
- printk("PHYSIN=%s ", physindev->name);
+ sb_add(m, "PHYSIN=%s ", physindev->name);
physoutdev = skb->nf_bridge->physoutdev;
if (physoutdev && out != physoutdev)
- printk("PHYSOUT=%s ", physoutdev->name);
+ sb_add(m, "PHYSOUT=%s ", physoutdev->name);
}
#endif
- if (in && !out) {
- /* MAC logging for input chain only. */
- printk("MAC=");
- if (skb->dev && skb->dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- int i;
- const unsigned char *p = skb_mac_header(skb);
- for (i = 0; i < skb->dev->hard_header_len; i++,p++)
- printk("%02x%c", *p,
- i==skb->dev->hard_header_len - 1
- ? ' ':':');
- } else
- printk(" ");
- }
+ /* MAC logging for input path only. */
+ if (in && !out)
+ dump_mac_header(m, loginfo, skb);
+
+ dump_packet(m, loginfo, skb, 0);
- dump_packet(loginfo, skb, 0);
- printk("\n");
- spin_unlock_bh(&log_lock);
+ sb_close(m);
}
static unsigned int
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index f43867d1697f..6cdb298f1035 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -48,7 +48,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
par->hooknum == NF_INET_POST_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT);
+ par->hooknum == NF_INET_LOCAL_OUT ||
+ par->hooknum == NF_INET_LOCAL_IN);
ct = nf_ct_get(skb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
@@ -77,7 +78,8 @@ static struct xt_target netmap_tg_reg __read_mostly = {
.table = "nat",
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT),
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg_check,
.me = THIS_MODULE
};
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index f5f4a888e4ec..43eec80c0e7c 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -95,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
}
tcph->rst = 1;
- tcph->check = tcp_v4_check(sizeof(struct tcphdr),
- niph->saddr, niph->daddr,
- csum_partial(tcph,
- sizeof(struct tcphdr), 0));
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+ niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)tcph - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
addr_type = RTN_UNSPEC;
if (hook != NF_INET_FORWARD
@@ -109,13 +110,13 @@ static void send_reset(struct sk_buff *oldskb, int hook)
addr_type = RTN_LOCAL;
/* ip_route_me_harder expects skb->dst to be set */
- skb_dst_set(nskb, dst_clone(skb_dst(oldskb)));
+ skb_dst_set_noref(nskb, skb_dst(oldskb));
+ nskb->protocol = htons(ETH_P_IP);
if (ip_route_me_harder(nskb, addr_type))
goto free_nskb;
niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
- nskb->ip_summed = CHECKSUM_NONE;
/* "Never happens" */
if (nskb->len > dst_mtu(skb_dst(nskb)))
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d68..37f8adb68c79 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/percpu.h>
+#include <linux/security.h>
#include <net/net_namespace.h>
#include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
rcu_read_unlock();
}
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ int ret;
+ u32 len;
+ char *secctx;
+
+ ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+ if (ret)
+ return ret;
+
+ ret = seq_printf(s, "secctx=%s ", secctx);
+
+ security_release_secctx(secctx, len);
+ return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+ return 0;
+}
+#endif
+
static int ct_seq_show(struct seq_file *s, void *v)
{
struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#endif
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
- if (seq_printf(s, "secmark=%u ", ct->secmark))
+ if (ct_show_secctx(s, ct))
goto release;
-#endif
if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
goto release;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index cb763ae9ed90..f3a9b42b16c6 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,6 +66,13 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (sk && (sk->sk_family == PF_INET) &&
+ inet->nodefrag)
+ return NF_ACCEPT;
+
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
/* Previously seen (loopback)? Ignore. Do this before
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b87668250..0f23b3f06df0 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 4f8bddb760c9..295c97431e43 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
static struct nf_conntrack_l3proto *l3proto __read_mostly;
#define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
__read_mostly;
static inline const struct nf_nat_protocol *
@@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum)
return rcu_dereference(nf_nat_protos[protonum]);
}
-const struct nf_nat_protocol *
+static const struct nf_nat_protocol *
nf_nat_proto_find_get(u_int8_t protonum)
{
const struct nf_nat_protocol *p;
@@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum)
return p;
}
-EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
-void
+static void
nf_nat_proto_put(const struct nf_nat_protocol *p)
{
module_put(p->me);
}
-EXPORT_SYMBOL_GPL(nf_nat_proto_put);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
@@ -261,17 +259,18 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
rcu_read_lock();
proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
- /* Change protocol info to have some randomization */
- if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
- proto->unique_tuple(tuple, range, maniptype, ct);
- goto out;
- }
-
/* Only bother mapping if it's not already in range and unique */
- if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
- proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
- !nf_nat_used_tuple(tuple, ct))
- goto out;
+ if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+ if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+ if (proto->in_range(tuple, maniptype, &range->min,
+ &range->max) &&
+ (range->min.all == range->max.all ||
+ !nf_nat_used_tuple(tuple, ct)))
+ goto out;
+ } else if (!nf_nat_used_tuple(tuple, ct)) {
+ goto out;
+ }
+ }
/* Last change: get protocol to try to obtain unique tuple. */
proto->unique_tuple(tuple, range, maniptype, ct);
@@ -440,7 +439,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
return 0;
- inside = (void *)skb->data + ip_hdrlen(skb);
+ inside = (void *)skb->data + hdrlen;
/* We're actually going to mangle it beyond trivial checksum
adjustment, so make sure the current checksum is correct. */
@@ -463,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
return 0;
}
+ if (manip == IP_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is reply dir. */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
pr_debug("icmp_reply_translation: translating error %p manip %u "
"dir %s\n", skb, manip,
dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -470,12 +481,10 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
/* rcu_read_lock()ed by nf_hook_slow */
l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
- if (!nf_ct_get_tuple(skb,
- ip_hdrlen(skb) + sizeof(struct icmphdr),
- (ip_hdrlen(skb) +
+ if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
+ (hdrlen +
sizeof(struct icmphdr) + inside->ip.ihl * 4),
- (u_int16_t)AF_INET,
- inside->ip.protocol,
+ (u_int16_t)AF_INET, inside->ip.protocol,
&inner, l3proto, l4proto))
return 0;
@@ -484,15 +493,13 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
pass all hooks (locally-generated ICMP). Consider incoming
packet: PREROUTING (DST manip), routing produces ICMP, goes
through POSTROUTING (which must correct the DST manip). */
- if (!manip_pkt(inside->ip.protocol, skb,
- ip_hdrlen(skb) + sizeof(inside->icmp),
- &ct->tuplehash[!dir].tuple,
- !manip))
+ if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
+ &ct->tuplehash[!dir].tuple, !manip))
return 0;
if (skb->ip_summed != CHECKSUM_PARTIAL) {
/* Reloading "inside" here since manip_pkt inner. */
- inside = (void *)skb->data + ip_hdrlen(skb);
+ inside = (void *)skb->data + hdrlen;
inside->icmp.checksum = 0;
inside->icmp.checksum =
csum_fold(skb_checksum(skb, hdrlen,
@@ -501,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
/* Change outer to look the reply to an incoming packet
* (proto 0 means don't invert per-proto part). */
- if (manip == IP_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply dir. */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- if (!manip_pkt(0, skb, 0, &target, manip))
- return 0;
- }
+ nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+ if (!manip_pkt(0, skb, 0, &target, manip))
+ return 0;
return 1;
}
@@ -742,7 +738,7 @@ static int __init nf_nat_init(void)
spin_unlock_bh(&nf_nat_lock);
/* Initialize fake conntrack so that NAT will skip it */
- nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a0..dc73abb3fe27 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853c..790f3160e012 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get a pair of ports. */
for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
nated_port != 0; nated_port += 2) {
+ int ret;
+
rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
- if (nf_ct_expect_related(rtp_exp) == 0) {
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == 0) {
rtcp_exp->tuple.dst.u.udp.port =
htons(nated_port + 1);
- if (nf_ct_expect_related(rtcp_exp) == 0)
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ nated_port = 0;
break;
- nf_ct_unexpect_related(rtp_exp);
+ }
+ } else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
}
}
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (; nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
break;
+ }
}
if (nated_port == 0) { /* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (; nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
}
if (nated_port == 0) { /* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (; nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
break;
+ }
}
if (nated_port == 0) { /* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
/* Try to get same port: if not, try to change it. */
for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(nated_port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
break;
+ else if (ret != -EBUSY) {
+ nated_port = 0;
+ break;
+ }
}
if (nated_port == 0) { /* No port available */
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548eee..31427fb57aa8 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
}
EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
+static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
+ int datalen, __sum16 *check, int oldlen)
+{
+ struct rtable *rt = skb_rtable(skb);
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (!(rt->rt_flags & RTCF_LOCAL) &&
+ skb->dev->features & NETIF_F_V4_CSUM) {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) +
+ skb_network_offset(skb) +
+ iph->ihl * 4;
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, iph->protocol, 0);
+ } else {
+ *check = 0;
+ *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ datalen, iph->protocol,
+ csum_partial(data, datalen,
+ 0));
+ if (iph->protocol == IPPROTO_UDP && !*check)
+ *check = CSUM_MANGLED_0;
+ }
+ } else
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), 1);
+}
+
/* Generic function for mangling variable-length address changes inside
* NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
* command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len, bool adjust)
{
- struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
struct tcphdr *tcph;
int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
match_offset, match_len, rep_buffer, rep_len);
datalen = skb->len - iph->ihl*4;
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- skb->dev->features & NETIF_F_V4_CSUM) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- iph->ihl * 4;
- skb->csum_offset = offsetof(struct tcphdr, check);
- tcph->check = ~tcp_v4_check(datalen,
- iph->saddr, iph->daddr, 0);
- } else {
- tcph->check = 0;
- tcph->check = tcp_v4_check(datalen,
- iph->saddr, iph->daddr,
- csum_partial(tcph,
- datalen, 0));
- }
- } else
- inet_proto_csum_replace2(&tcph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
if (adjust && rep_len != match_len)
nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len)
{
- struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
struct udphdr *udph;
int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
return 1;
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- skb->dev->features & NETIF_F_V4_CSUM) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- iph->ihl * 4;
- skb->csum_offset = offsetof(struct udphdr, check);
- udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, IPPROTO_UDP,
- 0);
- } else {
- udph->check = 0;
- udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, IPPROTO_UDP,
- csum_partial(udph,
- datalen, 0));
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
- } else
- inet_proto_csum_replace2(&udph->check, skb,
- htons(oldlen), htons(datalen), 1);
+ nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
return 1;
}
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03e..535e1a802356 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.tcp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 6c4f11f51446..3e61faf23a9a 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
}
EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
-bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct,
@@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
/* If it's dst rewrite, can't change port */
if (maniptype == IP_NAT_MANIP_DST)
- return false;
+ return;
if (ntohs(*portptr) < 1024) {
/* Loose convention: >> 512 is credential passing */
@@ -81,15 +81,15 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
else
off = *rover;
- for (i = 0; i < range_size; i++, off++) {
+ for (i = 0; ; ++off) {
*portptr = htons(min + off % range_size);
- if (nf_nat_used_tuple(tuple, ct))
+ if (++i != range_size && nf_nat_used_tuple(tuple, ct))
continue;
if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
*rover = off;
- return true;
+ return;
}
- return false;
+ return;
}
EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 22485ce306d4..570faf2667b2 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -22,14 +22,14 @@
static u_int16_t dccp_port_rover;
-static bool
+static void
dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &dccp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+ &dccp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index d7e89201351e..bc8d83a31c73 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
/* generate unique tuple ... */
-static bool
+static void
gre_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
@@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
/* If there is no master conntrack we are not PPTP,
do not change tuples */
if (!ct->master)
- return false;
+ return;
if (maniptype == IP_NAT_MANIP_SRC)
keyptr = &tuple->src.u.gre.key;
@@ -68,14 +68,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
pr_debug("min = %u, range_size = %u\n", min, range_size);
- for (i = 0; i < range_size; i++, key++) {
+ for (i = 0; ; ++key) {
*keyptr = htons(min + key % range_size);
- if (!nf_nat_used_tuple(tuple, ct))
- return true;
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
}
pr_debug("%p: no NAT mapping\n", ct);
- return false;
+ return;
}
/* manipulate a GRE packet according to maniptype */
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 19a8b0b07d8e..5744c3ec847c 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
}
-static bool
+static void
icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
@@ -42,13 +42,13 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
range_size = 0xFFFF;
- for (i = 0; i < range_size; i++, id++) {
+ for (i = 0; ; ++id) {
tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
(id % range_size));
- if (!nf_nat_used_tuple(tuple, ct))
- return true;
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
}
- return false;
+ return;
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 3fc598eeeb1a..756331d42661 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -16,14 +16,14 @@
static u_int16_t nf_sctp_port_rover;
-static bool
+static void
sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &nf_sctp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+ &nf_sctp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 399e2cfa263b..aa460a595d5d 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -20,14 +20,13 @@
static u_int16_t tcp_port_rover;
-static bool
+static void
tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &tcp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 9e61c79492e4..dfe65c7e2925 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -19,14 +19,13 @@
static u_int16_t udp_port_rover;
-static bool
+static void
udp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &udp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index 440a229bbd87..3cc8c8af39ef 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -18,14 +18,14 @@
static u_int16_t udplite_port_rover;
-static bool
+static void
udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &udplite_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+ &udplite_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index 14381c62acea..a50f2bc1c732 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
return true;
}
-static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
+static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
/* Sorry: we can't help you; if it's not unique, we can't frob
anything. */
- return false;
+ return;
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 98ed78281aee..21c30426480b 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -28,7 +28,8 @@
#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
(1 << NF_INET_POST_ROUTING) | \
- (1 << NF_INET_LOCAL_OUT))
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_LOCAL_IN))
static const struct xt_table nat_table = {
.name = "nat",
@@ -45,7 +46,8 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
enum ip_conntrack_info ctinfo;
const struct nf_nat_multi_range_compat *mr = par->targinfo;
- NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
+ NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_IN);
ct = nf_ct_get(skb, &ctinfo);
@@ -99,21 +101,20 @@ static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
return 0;
}
-unsigned int
+static unsigned int
alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
/* Force range to this IP; let proto decide mapping for
per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
- Use reply in case it's already been mangled (eg local packet).
*/
- __be32 ip
- = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
- ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
- : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
- struct nf_nat_range range
- = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
- pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
+ struct nf_nat_range range;
+
+ range.flags = 0;
+ pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+ HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
@@ -141,7 +142,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
.target = ipt_snat_target,
.targetsize = sizeof(struct nf_nat_multi_range_compat),
.table = "nat",
- .hooks = 1 << NF_INET_POST_ROUTING,
+ .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
.checkentry = ipt_snat_checkentry,
.family = AF_INET,
};
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaaec..e40cf7816fdb 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
exp->expectfn = ip_nat_sip_expected;
for (; port != 0; port++) {
+ int ret;
+
exp->tuple.dst.u.udp.port = htons(port);
- if (nf_ct_expect_related(exp) == 0)
+ ret = nf_ct_expect_related(exp);
+ if (ret == 0)
+ break;
+ else if (ret != -EBUSY) {
+ port = 0;
break;
+ }
}
if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
/* Try to get same pair of ports: if not, try to change them. */
for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
port != 0; port += 2) {
+ int ret;
+
rtp_exp->tuple.dst.u.udp.port = htons(port);
- if (nf_ct_expect_related(rtp_exp) != 0)
+ ret = nf_ct_expect_related(rtp_exp);
+ if (ret == -EBUSY)
continue;
+ else if (ret < 0) {
+ port = 0;
+ break;
+ }
rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
- if (nf_ct_expect_related(rtcp_exp) == 0)
+ ret = nf_ct_expect_related(rtcp_exp);
+ if (ret == 0)
break;
- nf_ct_unexpect_related(rtp_exp);
+ else if (ret != -EBUSY) {
+ nf_ct_unexpect_related(rtp_exp);
+ port = 0;
+ break;
+ }
}
if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 1679e2c0963d..ee5f419d0a56 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum,
unsigned char s[4];
if (offset & 1) {
- s[0] = s[2] = 0;
+ s[0] = ~0;
s[1] = ~*optr;
+ s[2] = 0;
s[3] = *nptr;
} else {
- s[1] = s[3] = 0;
s[0] = ~*optr;
+ s[1] = ~0;
s[2] = *nptr;
+ s[3] = 0;
}
*csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index beb25819c9c9..95481fee8bdb 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
return NF_ACCEPT;
/* Don't try to NAT if this packet is not conntracked */
- if (ct == &nf_conntrack_untracked)
+ if (nf_ct_is_untracked(ct))
return NF_ACCEPT;
nat = nfct_nat(ct);
@@ -131,13 +131,7 @@ nf_nat_fn(unsigned int hooknum,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- if (hooknum == NF_INET_LOCAL_IN)
- /* LOCAL_IN hook doesn't have a chain! */
- ret = alloc_null_binding(ct, hooknum);
- else
- ret = nf_nat_rule_find(skb, hooknum, in, out,
- ct);
-
+ ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
if (ret != NF_ACCEPT)
return ret;
} else
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3dc9914c1dce..4ae1f203f7cb 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -252,6 +252,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+ SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
SNMP_MIB_SENTINEL
};
@@ -342,10 +343,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
sysctl_ip_default_ttl);
+ BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.ip_statistics,
- snmp4_ipstats_list[i].entry));
+ seq_printf(seq, " %llu",
+ snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp4_ipstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
icmp_put(seq); /* RFC 2011 compatibility */
icmpmsg_put(seq);
@@ -431,9 +434,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nIpExt:");
for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.ip_statistics,
- snmp4_ipextstats_list[i].entry));
+ seq_printf(seq, " %llu",
+ snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
seq_putc(seq, '\n');
return 0;
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 542f22fc98b3..9ae5c01cd0b2 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
#include <linux/spinlock.h>
#include <net/protocol.h>
-const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
-static DEFINE_SPINLOCK(inet_proto_lock);
+const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
/*
* Add a protocol handler to the hash tables
@@ -37,21 +36,12 @@ static DEFINE_SPINLOCK(inet_proto_lock);
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int hash, ret;
+ int hash = protocol & (MAX_INET_PROTOS - 1);
- hash = protocol & (MAX_INET_PROTOS - 1);
-
- spin_lock_bh(&inet_proto_lock);
- if (inet_protos[hash]) {
- ret = -1;
- } else {
- inet_protos[hash] = prot;
- ret = 0;
- }
- spin_unlock_bh(&inet_proto_lock);
-
- return ret;
+ return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ NULL, prot) ? 0 : -1;
}
+EXPORT_SYMBOL(inet_add_protocol);
/*
* Remove a protocol from the hash tables.
@@ -59,23 +49,13 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- int hash, ret;
+ int ret, hash = protocol & (MAX_INET_PROTOS - 1);
- hash = protocol & (MAX_INET_PROTOS - 1);
-
- spin_lock_bh(&inet_proto_lock);
- if (inet_protos[hash] == prot) {
- inet_protos[hash] = NULL;
- ret = 0;
- } else {
- ret = -1;
- }
- spin_unlock_bh(&inet_proto_lock);
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+ prot, NULL) == prot) ? 0 : -1;
synchronize_net();
return ret;
}
-
-EXPORT_SYMBOL(inet_add_protocol);
EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c7a1639388a..1f85ef289895 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -314,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
}
static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
- struct rtable *rt,
+ struct rtable **rtp,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -323,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
struct sk_buff *skb;
unsigned int iphlen;
int err;
+ struct rtable *rt = *rtp;
- if (length > rt->u.dst.dev->mtu) {
+ if (length > rt->dst.dev->mtu) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
- rt->u.dst.dev->mtu);
+ rt->dst.dev->mtu);
return -EMSGSIZE;
}
if (flags&MSG_PROBE)
goto out;
skb = sock_alloc_send_skb(sk,
- length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15,
+ length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
flags & MSG_DONTWAIT, &err);
if (skb == NULL)
goto error;
- skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev));
+ skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- skb_dst_set(skb, dst_clone(&rt->u.dst));
+ skb_dst_set(skb, &rt->dst);
+ *rtp = NULL;
skb_reset_network_header(skb);
iph = ip_hdr(skb);
@@ -373,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
iph->check = 0;
iph->tot_len = htons(length);
if (!iph->id)
- ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_select_ident(iph, &rt->dst, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
@@ -382,7 +384,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
skb_transport_header(skb))->type);
err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
- rt->u.dst.dev, dst_output);
+ rt->dst.dev, dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -503,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
@@ -576,7 +578,7 @@ back_from_confirm:
if (inet->hdrincl)
err = raw_send_hdrinc(sk, msg->msg_iov, len,
- rt, msg->msg_flags);
+ &rt, msg->msg_flags);
else {
if (!ipc.addr)
@@ -604,7 +606,7 @@ out:
return len;
do_confirm:
- dst_confirm(&rt->u.dst);
+ dst_confirm(&rt->dst);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 560acc677ce4..987bf9adb318 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
.link_failure = ipv4_link_failure,
.update_pmtu = ip_rt_update_pmtu,
.local_out = __ip_local_out,
- .entries = ATOMIC_INIT(0),
};
#define ECN_OR_COST(class) TC_PRIO_##class
@@ -199,7 +198,7 @@ const __u8 ip_tos2prio[16] = {
*/
struct rt_hash_bucket {
- struct rtable *chain;
+ struct rtable __rcu *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -253,8 +252,7 @@ static unsigned rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) \
- (__raw_get_cpu_var(rt_cache_stat).field++)
+#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
int genid)
@@ -282,15 +280,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
struct rtable *r = NULL;
for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
- if (!rt_hash_table[st->bucket].chain)
+ if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
continue;
rcu_read_lock_bh();
r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
while (r) {
- if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
+ if (dev_net(r->dst.dev) == seq_file_net(seq) &&
r->rt_genid == st->genid)
return r;
- r = rcu_dereference_bh(r->u.dst.rt_next);
+ r = rcu_dereference_bh(r->dst.rt_next);
}
rcu_read_unlock_bh();
}
@@ -302,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
{
struct rt_cache_iter_state *st = seq->private;
- r = r->u.dst.rt_next;
+ r = rcu_dereference_bh(r->dst.rt_next);
while (!r) {
rcu_read_unlock_bh();
do {
if (--st->bucket < 0)
return NULL;
- } while (!rt_hash_table[st->bucket].chain);
+ } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
rcu_read_lock_bh();
- r = rt_hash_table[st->bucket].chain;
+ r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
}
- return rcu_dereference_bh(r);
+ return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -320,7 +318,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq,
{
struct rt_cache_iter_state *st = seq->private;
while ((r = __rt_cache_get_next(seq, r)) != NULL) {
- if (dev_net(r->u.dst.dev) != seq_file_net(seq))
+ if (dev_net(r->dst.dev) != seq_file_net(seq))
continue;
if (r->rt_genid == st->genid)
break;
@@ -378,19 +376,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
"%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->u.dst.dev ? r->u.dst.dev->name : "*",
+ r->dst.dev ? r->dst.dev->name : "*",
(__force u32)r->rt_dst,
(__force u32)r->rt_gateway,
- r->rt_flags, atomic_read(&r->u.dst.__refcnt),
- r->u.dst.__use, 0, (__force u32)r->rt_src,
- (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
- (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
- dst_metric(&r->u.dst, RTAX_WINDOW),
- (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
- dst_metric(&r->u.dst, RTAX_RTTVAR)),
+ r->rt_flags, atomic_read(&r->dst.__refcnt),
+ r->dst.__use, 0, (__force u32)r->rt_src,
+ (dst_metric(&r->dst, RTAX_ADVMSS) ?
+ (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
+ dst_metric(&r->dst, RTAX_WINDOW),
+ (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
+ dst_metric(&r->dst, RTAX_RTTVAR)),
r->fl.fl4_tos,
- r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
- r->u.dst.hh ? (r->u.dst.hh->hh_output ==
+ r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
+ r->dst.hh ? (r->dst.hh->hh_output ==
dev_queue_xmit) : 0,
r->rt_spec_dst, &len);
@@ -467,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
- atomic_read(&ipv4_dst_ops.entries),
+ dst_entries_get_slow(&ipv4_dst_ops),
st->in_hit,
st->in_slow_tot,
st->in_slow_mc,
@@ -609,13 +607,13 @@ static inline int ip_rt_proc_init(void)
static inline void rt_free(struct rtable *rt)
{
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+ call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline void rt_drop(struct rtable *rt)
{
ip_rt_put(rt);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+ call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
@@ -623,13 +621,13 @@ static inline int rt_fast_clean(struct rtable *rth)
/* Kill broadcast/multicast entries very aggresively, if they
collide in hash table with more useful entries */
return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
- rth->fl.iif && rth->u.dst.rt_next;
+ rth->fl.iif && rth->dst.rt_next;
}
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->u.dst.expires;
+ rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -637,15 +635,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
unsigned long age;
int ret = 0;
- if (atomic_read(&rth->u.dst.__refcnt))
+ if (atomic_read(&rth->dst.__refcnt))
goto out;
ret = 1;
- if (rth->u.dst.expires &&
- time_after_eq(jiffies, rth->u.dst.expires))
+ if (rth->dst.expires &&
+ time_after_eq(jiffies, rth->dst.expires))
goto out;
- age = jiffies - rth->u.dst.lastuse;
+ age = jiffies - rth->dst.lastuse;
ret = 0;
if ((age <= tmo1 && !rt_fast_clean(rth)) ||
(age <= tmo2 && rt_valuable(rth)))
@@ -661,7 +659,7 @@ out: return ret;
*/
static inline u32 rt_score(struct rtable *rt)
{
- u32 score = jiffies - rt->u.dst.lastuse;
+ u32 score = jiffies - rt->dst.lastuse;
score = ~score & ~(3<<30);
@@ -701,12 +699,12 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
- return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
+ return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
static inline int rt_is_expired(struct rtable *rth)
{
- return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
+ return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
@@ -723,19 +721,23 @@ static void rt_do_flush(int process_context)
for (i = 0; i <= rt_hash_mask; i++) {
if (process_context && need_resched())
cond_resched();
- rth = rt_hash_table[i].chain;
+ rth = rcu_dereference_raw(rt_hash_table[i].chain);
if (!rth)
continue;
spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
{
- struct rtable ** prev, * p;
+ struct rtable __rcu **prev;
+ struct rtable *p;
- rth = rt_hash_table[i].chain;
+ rth = rcu_dereference_protected(rt_hash_table[i].chain,
+ lockdep_is_held(rt_hash_lock_addr(i)));
/* defer releasing the head of the list after spin_unlock */
- for (tail = rth; tail; tail = tail->u.dst.rt_next)
+ for (tail = rth; tail;
+ tail = rcu_dereference_protected(tail->dst.rt_next,
+ lockdep_is_held(rt_hash_lock_addr(i))))
if (!rt_is_expired(tail))
break;
if (rth != tail)
@@ -743,10 +745,14 @@ static void rt_do_flush(int process_context)
/* call rt_free on entries after the tail requiring flush */
prev = &rt_hash_table[i].chain;
- for (p = *prev; p; p = next) {
- next = p->u.dst.rt_next;
+ for (p = rcu_dereference_protected(*prev,
+ lockdep_is_held(rt_hash_lock_addr(i)));
+ p != NULL;
+ p = next) {
+ next = rcu_dereference_protected(p->dst.rt_next,
+ lockdep_is_held(rt_hash_lock_addr(i)));
if (!rt_is_expired(p)) {
- prev = &p->u.dst.rt_next;
+ prev = &p->dst.rt_next;
} else {
*prev = next;
rt_free(p);
@@ -754,14 +760,15 @@ static void rt_do_flush(int process_context)
}
}
#else
- rth = rt_hash_table[i].chain;
- rt_hash_table[i].chain = NULL;
+ rth = rcu_dereference_protected(rt_hash_table[i].chain,
+ lockdep_is_held(rt_hash_lock_addr(i)));
+ rcu_assign_pointer(rt_hash_table[i].chain, NULL);
tail = NULL;
#endif
spin_unlock_bh(rt_hash_lock_addr(i));
for (; rth != tail; rth = next) {
- next = rth->u.dst.rt_next;
+ next = rcu_dereference_protected(rth->dst.rt_next, 1);
rt_free(rth);
}
}
@@ -792,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
while (aux != rth) {
if (compare_hash_inputs(&aux->fl, &rth->fl))
return 0;
- aux = aux->u.dst.rt_next;
+ aux = rcu_dereference_protected(aux->dst.rt_next, 1);
}
return ONE;
}
@@ -801,7 +808,8 @@ static void rt_check_expire(void)
{
static unsigned int rover;
unsigned int i = rover, goal;
- struct rtable *rth, **rthp;
+ struct rtable *rth;
+ struct rtable __rcu **rthp;
unsigned long samples = 0;
unsigned long sum = 0, sum2 = 0;
unsigned long delta;
@@ -827,23 +835,24 @@ static void rt_check_expire(void)
samples++;
- if (*rthp == NULL)
+ if (rcu_dereference_raw(*rthp) == NULL)
continue;
length = 0;
spin_lock_bh(rt_hash_lock_addr(i));
- while ((rth = *rthp) != NULL) {
- prefetch(rth->u.dst.rt_next);
+ while ((rth = rcu_dereference_protected(*rthp,
+ lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+ prefetch(rth->dst.rt_next);
if (rt_is_expired(rth)) {
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
continue;
}
- if (rth->u.dst.expires) {
+ if (rth->dst.expires) {
/* Entry is expired even if it is in use */
- if (time_before_eq(jiffies, rth->u.dst.expires)) {
+ if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
tmo >>= 1;
- rthp = &rth->u.dst.rt_next;
+ rthp = &rth->dst.rt_next;
/*
* We only count entries on
* a chain with equal hash inputs once
@@ -859,7 +868,7 @@ nofree:
goto nofree;
/* Cleanup aged off entries. */
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
}
spin_unlock_bh(rt_hash_lock_addr(i));
@@ -943,9 +952,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
static unsigned long last_gc;
static int rover;
static int equilibrium;
- struct rtable *rth, **rthp;
+ struct rtable *rth;
+ struct rtable __rcu **rthp;
unsigned long now = jiffies;
int goal;
+ int entries = dst_entries_get_fast(&ipv4_dst_ops);
/*
* Garbage collection is pretty expensive,
@@ -955,28 +966,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
RT_CACHE_STAT_INC(gc_total);
if (now - last_gc < ip_rt_gc_min_interval &&
- atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+ entries < ip_rt_max_size) {
RT_CACHE_STAT_INC(gc_ignored);
goto out;
}
+ entries = dst_entries_get_slow(&ipv4_dst_ops);
/* Calculate number of entries, which we want to expire now. */
- goal = atomic_read(&ipv4_dst_ops.entries) -
- (ip_rt_gc_elasticity << rt_hash_log);
+ goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ goal = entries - equilibrium;
if (goal > 0) {
equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ goal = entries - equilibrium;
}
} else {
/* We are in dangerous area. Try to reduce cache really
* aggressively.
*/
goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
- equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ equilibrium = entries - goal;
}
if (now - last_gc >= ip_rt_gc_min_interval)
@@ -996,14 +1007,15 @@ static int rt_garbage_collect(struct dst_ops *ops)
k = (k + 1) & rt_hash_mask;
rthp = &rt_hash_table[k].chain;
spin_lock_bh(rt_hash_lock_addr(k));
- while ((rth = *rthp) != NULL) {
+ while ((rth = rcu_dereference_protected(*rthp,
+ lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
if (!rt_is_expired(rth) &&
!rt_may_expire(rth, tmo, expire)) {
tmo >>= 1;
- rthp = &rth->u.dst.rt_next;
+ rthp = &rth->dst.rt_next;
continue;
}
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
goal--;
}
@@ -1033,14 +1045,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
expire >>= 1;
#if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, i);
+ dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
} while (!in_softirq() && time_before_eq(jiffies, now));
- if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+ goto out;
+ if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
goto out;
if (net_ratelimit())
printk(KERN_WARNING "dst cache overflow\n");
@@ -1050,11 +1064,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
work_done:
expire += ip_rt_gc_min_interval;
if (expire > ip_rt_gc_timeout ||
- atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+ dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+ dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
- atomic_read(&ipv4_dst_ops.entries), goal, rover);
+ dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out: return 0;
}
@@ -1069,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head)
while (rth) {
length += has_noalias(head, rth);
- rth = rth->u.dst.rt_next;
+ rth = rcu_dereference_protected(rth->dst.rt_next, 1);
}
return length >> FRACT_BITS;
}
@@ -1077,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head)
static int rt_intern_hash(unsigned hash, struct rtable *rt,
struct rtable **rp, struct sk_buff *skb, int ifindex)
{
- struct rtable *rth, **rthp;
+ struct rtable *rth, *cand;
+ struct rtable __rcu **rthp, **candp;
unsigned long now;
- struct rtable *cand, **candp;
u32 min_score;
int chain_length;
int attempts = !in_softirq();
@@ -1091,7 +1106,7 @@ restart:
candp = NULL;
now = jiffies;
- if (!rt_caching(dev_net(rt->u.dst.dev))) {
+ if (!rt_caching(dev_net(rt->dst.dev))) {
/*
* If we're not caching, just tell the caller we
* were successful and don't touch the route. The
@@ -1103,44 +1118,45 @@ restart:
* Note that we do rt_free on this new route entry, so that
* once its refcount hits zero, we are still able to reap it
* (Thanks Alexey)
- * Note also the rt_free uses call_rcu. We don't actually
- * need rcu protection here, this is just our path to get
- * on the route gc list.
+ * Note: To avoid expensive rcu stuff for this uncached dst,
+ * we set DST_NOCACHE so that dst_release() can free dst without
+ * waiting a grace period.
*/
+ rt->dst.flags |= DST_NOCACHE;
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
- int err = arp_bind_neighbour(&rt->u.dst);
+ int err = arp_bind_neighbour(&rt->dst);
if (err) {
if (net_ratelimit())
printk(KERN_WARNING
"Neighbour table failure & not caching routes.\n");
- rt_drop(rt);
+ ip_rt_put(rt);
return err;
}
}
- rt_free(rt);
goto skip_hashing;
}
rthp = &rt_hash_table[hash].chain;
spin_lock_bh(rt_hash_lock_addr(hash));
- while ((rth = *rthp) != NULL) {
+ while ((rth = rcu_dereference_protected(*rthp,
+ lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
if (rt_is_expired(rth)) {
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
continue;
}
if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
/* Put it first */
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
/*
* Since lookup is lockfree, the deletion
* must be visible to another weakly ordered CPU before
* the insertion at the start of the hash chain.
*/
- rcu_assign_pointer(rth->u.dst.rt_next,
+ rcu_assign_pointer(rth->dst.rt_next,
rt_hash_table[hash].chain);
/*
* Since lookup is lockfree, the update writes
@@ -1148,18 +1164,18 @@ restart:
*/
rcu_assign_pointer(rt_hash_table[hash].chain, rth);
- dst_use(&rth->u.dst, now);
+ dst_use(&rth->dst, now);
spin_unlock_bh(rt_hash_lock_addr(hash));
rt_drop(rt);
if (rp)
*rp = rth;
else
- skb_dst_set(skb, &rth->u.dst);
+ skb_dst_set(skb, &rth->dst);
return 0;
}
- if (!atomic_read(&rth->u.dst.__refcnt)) {
+ if (!atomic_read(&rth->dst.__refcnt)) {
u32 score = rt_score(rth);
if (score <= min_score) {
@@ -1171,7 +1187,7 @@ restart:
chain_length++;
- rthp = &rth->u.dst.rt_next;
+ rthp = &rth->dst.rt_next;
}
if (cand) {
@@ -1182,17 +1198,17 @@ restart:
* only 2 entries per bucket. We will see.
*/
if (chain_length > ip_rt_gc_elasticity) {
- *candp = cand->u.dst.rt_next;
+ *candp = cand->dst.rt_next;
rt_free(cand);
}
} else {
if (chain_length > rt_chain_length_max &&
slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
- struct net *net = dev_net(rt->u.dst.dev);
+ struct net *net = dev_net(rt->dst.dev);
int num = ++net->ipv4.current_rt_cache_rebuild_count;
if (!rt_caching(net)) {
printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
- rt->u.dst.dev->name, num);
+ rt->dst.dev->name, num);
}
rt_emergency_hash_rebuild(net);
spin_unlock_bh(rt_hash_lock_addr(hash));
@@ -1207,7 +1223,7 @@ restart:
route or unicast forwarding path.
*/
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
- int err = arp_bind_neighbour(&rt->u.dst);
+ int err = arp_bind_neighbour(&rt->dst);
if (err) {
spin_unlock_bh(rt_hash_lock_addr(hash));
@@ -1232,20 +1248,20 @@ restart:
}
if (net_ratelimit())
- printk(KERN_WARNING "Neighbour table overflow.\n");
+ printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
rt_drop(rt);
return -ENOBUFS;
}
}
- rt->u.dst.rt_next = rt_hash_table[hash].chain;
+ rt->dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
- if (rt->u.dst.rt_next) {
+ if (rt->dst.rt_next) {
struct rtable *trt;
printk(KERN_DEBUG "rt_cache @%02x: %pI4",
hash, &rt->rt_dst);
- for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
+ for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
printk(" . %pI4", &trt->rt_dst);
printk("\n");
}
@@ -1263,24 +1279,17 @@ skip_hashing:
if (rp)
*rp = rt;
else
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
return 0;
}
void rt_bind_peer(struct rtable *rt, int create)
{
- static DEFINE_SPINLOCK(rt_peer_lock);
struct inet_peer *peer;
peer = inet_getpeer(rt->rt_dst, create);
- spin_lock_bh(&rt_peer_lock);
- if (rt->peer == NULL) {
- rt->peer = peer;
- peer = NULL;
- }
- spin_unlock_bh(&rt_peer_lock);
- if (peer)
+ if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
inet_putpeer(peer);
}
@@ -1325,31 +1334,36 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
ip_select_fb_ident(iph);
}
+EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
- struct rtable **rthp, *aux;
+ struct rtable __rcu **rthp;
+ struct rtable *aux;
rthp = &rt_hash_table[hash].chain;
spin_lock_bh(rt_hash_lock_addr(hash));
ip_rt_put(rt);
- while ((aux = *rthp) != NULL) {
+ while ((aux = rcu_dereference_protected(*rthp,
+ lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
if (aux == rt || rt_is_expired(aux)) {
- *rthp = aux->u.dst.rt_next;
+ *rthp = aux->dst.rt_next;
rt_free(aux);
continue;
}
- rthp = &aux->u.dst.rt_next;
+ rthp = &aux->dst.rt_next;
}
spin_unlock_bh(rt_hash_lock_addr(hash));
}
+/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
{
int i, k;
- struct in_device *in_dev = in_dev_get(dev);
- struct rtable *rth, **rthp;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct rtable *rth;
+ struct rtable __rcu **rthp;
__be32 skeys[2] = { saddr, 0 };
int ikeys[2] = { dev->ifindex, 0 };
struct netevent_redirect netevent;
@@ -1382,9 +1396,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
rt_genid(net));
- rthp=&rt_hash_table[hash].chain;
+ rthp = &rt_hash_table[hash].chain;
- rcu_read_lock();
while ((rth = rcu_dereference(*rthp)) != NULL) {
struct rtable *rt;
@@ -1393,44 +1406,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rth->fl.oif != ikeys[k] ||
rth->fl.iif != 0 ||
rt_is_expired(rth) ||
- !net_eq(dev_net(rth->u.dst.dev), net)) {
- rthp = &rth->u.dst.rt_next;
+ !net_eq(dev_net(rth->dst.dev), net)) {
+ rthp = &rth->dst.rt_next;
continue;
}
if (rth->rt_dst != daddr ||
rth->rt_src != saddr ||
- rth->u.dst.error ||
+ rth->dst.error ||
rth->rt_gateway != old_gw ||
- rth->u.dst.dev != dev)
+ rth->dst.dev != dev)
break;
- dst_hold(&rth->u.dst);
- rcu_read_unlock();
+ dst_hold(&rth->dst);
rt = dst_alloc(&ipv4_dst_ops);
if (rt == NULL) {
ip_rt_put(rth);
- in_dev_put(in_dev);
return;
}
/* Copy all the information. */
*rt = *rth;
- rt->u.dst.__use = 1;
- atomic_set(&rt->u.dst.__refcnt, 1);
- rt->u.dst.child = NULL;
- if (rt->u.dst.dev)
- dev_hold(rt->u.dst.dev);
+ rt->dst.__use = 1;
+ atomic_set(&rt->dst.__refcnt, 1);
+ rt->dst.child = NULL;
+ if (rt->dst.dev)
+ dev_hold(rt->dst.dev);
if (rt->idev)
in_dev_hold(rt->idev);
- rt->u.dst.obsolete = -1;
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.path = &rt->u.dst;
- rt->u.dst.neighbour = NULL;
- rt->u.dst.hh = NULL;
+ rt->dst.obsolete = -1;
+ rt->dst.lastuse = jiffies;
+ rt->dst.path = &rt->dst;
+ rt->dst.neighbour = NULL;
+ rt->dst.hh = NULL;
#ifdef CONFIG_XFRM
- rt->u.dst.xfrm = NULL;
+ rt->dst.xfrm = NULL;
#endif
rt->rt_genid = rt_genid(net);
rt->rt_flags |= RTCF_REDIRECTED;
@@ -1439,23 +1450,23 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rt->rt_gateway = new_gw;
/* Redirect received -> path was valid */
- dst_confirm(&rth->u.dst);
+ dst_confirm(&rth->dst);
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
- if (arp_bind_neighbour(&rt->u.dst) ||
- !(rt->u.dst.neighbour->nud_state &
+ if (arp_bind_neighbour(&rt->dst) ||
+ !(rt->dst.neighbour->nud_state &
NUD_VALID)) {
- if (rt->u.dst.neighbour)
- neigh_event_send(rt->u.dst.neighbour, NULL);
+ if (rt->dst.neighbour)
+ neigh_event_send(rt->dst.neighbour, NULL);
ip_rt_put(rth);
rt_drop(rt);
goto do_next;
}
- netevent.old = &rth->u.dst;
- netevent.new = &rt->u.dst;
+ netevent.old = &rth->dst;
+ netevent.new = &rt->dst;
call_netevent_notifiers(NETEVENT_REDIRECT,
&netevent);
@@ -1464,12 +1475,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
ip_rt_put(rt);
goto do_next;
}
- rcu_read_unlock();
do_next:
;
}
}
- in_dev_put(in_dev);
return;
reject_redirect:
@@ -1480,7 +1489,7 @@ reject_redirect:
&old_gw, dev->name, &new_gw,
&saddr, &daddr);
#endif
- in_dev_put(in_dev);
+ ;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1493,8 +1502,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
ip_rt_put(rt);
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- (rt->u.dst.expires &&
- time_after_eq(jiffies, rt->u.dst.expires))) {
+ (rt->dst.expires &&
+ time_after_eq(jiffies, rt->dst.expires))) {
unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
rt->fl.oif,
rt_genid(dev_net(dst->dev)));
@@ -1532,7 +1541,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
int log_martians;
rcu_read_lock();
- in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
rcu_read_unlock();
return;
@@ -1543,30 +1552,30 @@ void ip_rt_send_redirect(struct sk_buff *skb)
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
- rt->u.dst.rate_tokens = 0;
+ if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
+ rt->dst.rate_tokens = 0;
/* Too many ignored redirects; do not send anything
- * set u.dst.rate_last to the last seen redirected packet.
+ * set dst.rate_last to the last seen redirected packet.
*/
- if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
- rt->u.dst.rate_last = jiffies;
+ if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
+ rt->dst.rate_last = jiffies;
return;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (rt->u.dst.rate_tokens == 0 ||
+ if (rt->dst.rate_tokens == 0 ||
time_after(jiffies,
- (rt->u.dst.rate_last +
- (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
+ (rt->dst.rate_last +
+ (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
- rt->u.dst.rate_last = jiffies;
- ++rt->u.dst.rate_tokens;
+ rt->dst.rate_last = jiffies;
+ ++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- rt->u.dst.rate_tokens == ip_rt_redirect_number &&
+ rt->dst.rate_tokens == ip_rt_redirect_number &&
net_ratelimit())
printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
&rt->rt_src, rt->rt_iif,
@@ -1581,7 +1590,7 @@ static int ip_error(struct sk_buff *skb)
unsigned long now;
int code;
- switch (rt->u.dst.error) {
+ switch (rt->dst.error) {
case EINVAL:
default:
goto out;
@@ -1590,7 +1599,7 @@ static int ip_error(struct sk_buff *skb)
break;
case ENETUNREACH:
code = ICMP_NET_UNREACH;
- IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+ IP_INC_STATS_BH(dev_net(rt->dst.dev),
IPSTATS_MIB_INNOROUTES);
break;
case EACCES:
@@ -1599,12 +1608,12 @@ static int ip_error(struct sk_buff *skb)
}
now = jiffies;
- rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
- if (rt->u.dst.rate_tokens > ip_rt_error_burst)
- rt->u.dst.rate_tokens = ip_rt_error_burst;
- rt->u.dst.rate_last = now;
- if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
- rt->u.dst.rate_tokens -= ip_rt_error_cost;
+ rt->dst.rate_tokens += now - rt->dst.rate_last;
+ if (rt->dst.rate_tokens > ip_rt_error_burst)
+ rt->dst.rate_tokens = ip_rt_error_burst;
+ rt->dst.rate_last = now;
+ if (rt->dst.rate_tokens >= ip_rt_error_cost) {
+ rt->dst.rate_tokens -= ip_rt_error_cost;
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
}
@@ -1649,7 +1658,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.dst.rt_next)) {
+ rth = rcu_dereference(rth->dst.rt_next)) {
unsigned short mtu = new_mtu;
if (rth->fl.fl4_dst != daddr ||
@@ -1658,8 +1667,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
rth->rt_src != iph->saddr ||
rth->fl.oif != ikeys[k] ||
rth->fl.iif != 0 ||
- dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
- !net_eq(dev_net(rth->u.dst.dev), net) ||
+ dst_metric_locked(&rth->dst, RTAX_MTU) ||
+ !net_eq(dev_net(rth->dst.dev), net) ||
rt_is_expired(rth))
continue;
@@ -1667,22 +1676,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
/* BSD 4.2 compatibility hack :-( */
if (mtu == 0 &&
- old_mtu >= dst_mtu(&rth->u.dst) &&
+ old_mtu >= dst_mtu(&rth->dst) &&
old_mtu >= 68 + (iph->ihl << 2))
old_mtu -= iph->ihl << 2;
mtu = guess_mtu(old_mtu);
}
- if (mtu <= dst_mtu(&rth->u.dst)) {
- if (mtu < dst_mtu(&rth->u.dst)) {
- dst_confirm(&rth->u.dst);
+ if (mtu <= dst_mtu(&rth->dst)) {
+ if (mtu < dst_mtu(&rth->dst)) {
+ dst_confirm(&rth->dst);
if (mtu < ip_rt_min_pmtu) {
mtu = ip_rt_min_pmtu;
- rth->u.dst.metrics[RTAX_LOCK-1] |=
+ rth->dst.metrics[RTAX_LOCK-1] |=
(1 << RTAX_MTU);
}
- rth->u.dst.metrics[RTAX_MTU-1] = mtu;
- dst_set_expires(&rth->u.dst,
+ rth->dst.metrics[RTAX_MTU-1] = mtu;
+ dst_set_expires(&rth->dst,
ip_rt_mtu_expires);
}
est_mtu = mtu;
@@ -1755,7 +1764,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt)
- dst_set_expires(&rt->u.dst, 0);
+ dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -1783,22 +1792,25 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt->fl.iif == 0)
src = rt->rt_src;
- else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
- src = FIB_RES_PREFSRC(res);
- fib_res_put(&res);
- } else
- src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+ else {
+ rcu_read_lock();
+ if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
+ src = FIB_RES_PREFSRC(res);
+ else
+ src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
RT_SCOPE_UNIVERSE);
+ rcu_read_unlock();
+ }
memcpy(addr, &src, 4);
}
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
- if (!(rt->u.dst.tclassid & 0xFFFF))
- rt->u.dst.tclassid |= tag & 0xFFFF;
- if (!(rt->u.dst.tclassid & 0xFFFF0000))
- rt->u.dst.tclassid |= tag & 0xFFFF0000;
+ if (!(rt->dst.tclassid & 0xFFFF))
+ rt->dst.tclassid |= tag & 0xFFFF;
+ if (!(rt->dst.tclassid & 0xFFFF0000))
+ rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
@@ -1810,30 +1822,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
if (FIB_RES_GW(*res) &&
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
- memcpy(rt->u.dst.metrics, fi->fib_metrics,
- sizeof(rt->u.dst.metrics));
+ memcpy(rt->dst.metrics, fi->fib_metrics,
+ sizeof(rt->dst.metrics));
if (fi->fib_mtu == 0) {
- rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
- if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
+ rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
+ if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
rt->rt_gateway != rt->rt_dst &&
- rt->u.dst.dev->mtu > 576)
- rt->u.dst.metrics[RTAX_MTU-1] = 576;
+ rt->dst.dev->mtu > 576)
+ rt->dst.metrics[RTAX_MTU-1] = 576;
}
#ifdef CONFIG_NET_CLS_ROUTE
- rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+ rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
} else
- rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
-
- if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
- rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
- if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
- rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
- if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
- rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+ rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
+
+ if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
+ rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+ if (dst_mtu(&rt->dst) > IP_MAX_MTU)
+ rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+ if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
+ rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
ip_rt_min_advmss);
- if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
- rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+ if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
+ rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1844,14 +1856,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
rt->rt_type = res->type;
}
+/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, int our)
{
- unsigned hash;
+ unsigned int hash;
struct rtable *rth;
__be32 spec_dst;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
u32 itag = 0;
+ int err;
/* Primary sanity checks. */
@@ -1866,21 +1880,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (!ipv4_is_local_multicast(daddr))
goto e_inval;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- } else if (fib_validate_source(saddr, 0, tos, 0,
- dev, &spec_dst, &itag, 0) < 0)
- goto e_inval;
-
+ } else {
+ err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+ &itag, 0);
+ if (err < 0)
+ goto e_err;
+ }
rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
goto e_nobufs;
- rth->u.dst.output = ip_rt_bug;
- rth->u.dst.obsolete = -1;
+ rth->dst.output = ip_rt_bug;
+ rth->dst.obsolete = -1;
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -1888,13 +1904,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
- rth->u.dst.tclassid = itag;
+ rth->dst.tclassid = itag;
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = init_net.loopback_dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->dst.dev = init_net.loopback_dev;
+ dev_hold(rth->dst.dev);
+ rth->idev = in_dev_get(rth->dst.dev);
rth->fl.oif = 0;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
@@ -1902,27 +1918,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_flags = RTCF_MULTICAST;
rth->rt_type = RTN_MULTICAST;
if (our) {
- rth->u.dst.input= ip_local_deliver;
+ rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
}
#ifdef CONFIG_IP_MROUTE
if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
- rth->u.dst.input = ip_mr_input;
+ rth->dst.input = ip_mr_input;
#endif
RT_CACHE_STAT_INC(in_slow_mc);
- in_dev_put(in_dev);
hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
e_nobufs:
- in_dev_put(in_dev);
return -ENOBUFS;
-
e_inval:
- in_dev_put(in_dev);
return -EINVAL;
+e_err:
+ return err;
}
@@ -1956,22 +1970,22 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
+/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
{
-
struct rtable *rth;
int err;
struct in_device *out_dev;
- unsigned flags = 0;
+ unsigned int flags = 0;
__be32 spec_dst;
u32 itag;
/* get a working reference to the output device */
- out_dev = in_dev_get(FIB_RES_DEV(*res));
+ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
if (out_dev == NULL) {
if (net_ratelimit())
printk(KERN_CRIT "Bug in ip_route_input" \
@@ -1986,7 +2000,6 @@ static int __mkroute_input(struct sk_buff *skb,
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
- err = -EINVAL;
goto cleanup;
}
@@ -2020,12 +2033,12 @@ static int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
if (IN_DEV_CONF_GET(out_dev, NOXFRM))
- rth->u.dst.flags |= DST_NOXFRM;
+ rth->dst.flags |= DST_NOXFRM;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -2035,16 +2048,16 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_gateway = daddr;
rth->rt_iif =
rth->fl.iif = in_dev->dev->ifindex;
- rth->u.dst.dev = (out_dev)->dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->dst.dev = (out_dev)->dev;
+ dev_hold(rth->dst.dev);
+ rth->idev = in_dev_get(rth->dst.dev);
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
- rth->u.dst.obsolete = -1;
- rth->u.dst.input = ip_forward;
- rth->u.dst.output = ip_output;
- rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
+ rth->dst.obsolete = -1;
+ rth->dst.input = ip_forward;
+ rth->dst.output = ip_output;
+ rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
rt_set_nexthop(rth, res, itag);
@@ -2053,8 +2066,6 @@ static int __mkroute_input(struct sk_buff *skb,
*result = rth;
err = 0;
cleanup:
- /* release the working reference to the output device */
- in_dev_put(out_dev);
return err;
}
@@ -2080,7 +2091,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl->iif,
- rt_genid(dev_net(rth->u.dst.dev)));
+ rt_genid(dev_net(rth->dst.dev)));
return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
@@ -2092,13 +2103,14 @@ static int ip_mkroute_input(struct sk_buff *skb,
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
* 2. IP spoofing attempts are filtered with 100% of guarantee.
+ * called with rcu_read_lock()
*/
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
struct fib_result res;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct flowi fl = { .nl_u = { .ip4_u =
{ .daddr = daddr,
.saddr = saddr,
@@ -2113,7 +2125,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
unsigned hash;
__be32 spec_dst;
int err = -EINVAL;
- int free_res = 0;
struct net * net = dev_net(dev);
/* IP on this device is disabled. */
@@ -2129,7 +2140,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
ipv4_is_loopback(saddr))
goto martian_source;
- if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
+ if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
/* Accept zero addresses only to limited broadcast;
@@ -2138,19 +2149,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_zeronet(saddr))
goto martian_source;
- if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
- ipv4_is_loopback(daddr))
+ if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
goto martian_destination;
/*
* Now we are ready to route packet.
*/
- if ((err = fib_lookup(net, &fl, &res)) != 0) {
+ err = fib_lookup(net, &fl, &res);
+ if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
goto no_route;
}
- free_res = 1;
RT_CACHE_STAT_INC(in_slow_tot);
@@ -2158,13 +2168,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto brd_input;
if (res.type == RTN_LOCAL) {
- int result;
- result = fib_validate_source(saddr, daddr, tos,
- net->loopback_dev->ifindex,
- dev, &spec_dst, &itag, skb->mark);
- if (result < 0)
- goto martian_source;
- if (result)
+ err = fib_validate_source(saddr, daddr, tos,
+ net->loopback_dev->ifindex,
+ dev, &spec_dst, &itag, skb->mark);
+ if (err < 0)
+ goto martian_source_keep_err;
+ if (err)
flags |= RTCF_DIRECTSRC;
spec_dst = daddr;
goto local_input;
@@ -2176,10 +2185,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_destination;
err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
-done:
- in_dev_put(in_dev);
- if (free_res)
- fib_res_put(&res);
out: return err;
brd_input:
@@ -2192,7 +2197,7 @@ brd_input:
err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
&itag, skb->mark);
if (err < 0)
- goto martian_source;
+ goto martian_source_keep_err;
if (err)
flags |= RTCF_DIRECTSRC;
}
@@ -2205,14 +2210,14 @@ local_input:
if (!rth)
goto e_nobufs;
- rth->u.dst.output= ip_rt_bug;
- rth->u.dst.obsolete = -1;
+ rth->dst.output= ip_rt_bug;
+ rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(net);
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -2220,26 +2225,26 @@ local_input:
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
- rth->u.dst.tclassid = itag;
+ rth->dst.tclassid = itag;
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = net->loopback_dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->dst.dev = net->loopback_dev;
+ dev_hold(rth->dst.dev);
+ rth->idev = in_dev_get(rth->dst.dev);
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
- rth->u.dst.input= ip_local_deliver;
+ rth->dst.input= ip_local_deliver;
rth->rt_flags = flags|RTCF_LOCAL;
if (res.type == RTN_UNREACHABLE) {
- rth->u.dst.input= ip_error;
- rth->u.dst.error= -err;
+ rth->dst.input= ip_error;
+ rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
- goto done;
+ goto out;
no_route:
RT_CACHE_STAT_INC(in_no_route);
@@ -2262,19 +2267,21 @@ martian_destination:
e_hostunreach:
err = -EHOSTUNREACH;
- goto done;
+ goto out;
e_inval:
err = -EINVAL;
- goto done;
+ goto out;
e_nobufs:
err = -ENOBUFS;
- goto done;
+ goto out;
martian_source:
+ err = -EINVAL;
+martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
- goto e_inval;
+ goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2284,32 +2291,34 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
unsigned hash;
int iif = dev->ifindex;
struct net *net;
+ int res;
net = dev_net(dev);
+ rcu_read_lock();
+
if (!rt_caching(net))
goto skip_cache;
tos &= IPTOS_RT_MASK;
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
- rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.dst.rt_next)) {
+ rth = rcu_dereference(rth->dst.rt_next)) {
if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
(rth->fl.iif ^ iif) |
rth->fl.oif |
(rth->fl.fl4_tos ^ tos)) == 0 &&
rth->fl.mark == skb->mark &&
- net_eq(dev_net(rth->u.dst.dev), net) &&
+ net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
if (noref) {
- dst_use_noref(&rth->u.dst, jiffies);
- skb_dst_set_noref(skb, &rth->u.dst);
+ dst_use_noref(&rth->dst, jiffies);
+ skb_dst_set_noref(skb, &rth->dst);
} else {
- dst_use(&rth->u.dst, jiffies);
- skb_dst_set(skb, &rth->u.dst);
+ dst_use(&rth->dst, jiffies);
+ skb_dst_set(skb, &rth->dst);
}
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
@@ -2317,7 +2326,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
}
RT_CACHE_STAT_INC(in_hlist_search);
}
- rcu_read_unlock();
skip_cache:
/* Multicast recognition logic is moved from route cache to here.
@@ -2332,12 +2340,11 @@ skip_cache:
route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
- struct in_device *in_dev;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
- rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
+ if (in_dev) {
int our = ip_check_mc(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
+ ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
||
@@ -2345,18 +2352,22 @@ skip_cache:
IN_DEV_MFORWARD(in_dev))
#endif
) {
+ int res = ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
rcu_read_unlock();
- return ip_route_input_mc(skb, daddr, saddr,
- tos, dev, our);
+ return res;
}
}
rcu_read_unlock();
return -EINVAL;
}
- return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+ res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+ rcu_read_unlock();
+ return res;
}
EXPORT_SYMBOL(ip_route_input_common);
+/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
struct fib_result *res,
const struct flowi *fl,
@@ -2367,60 +2378,54 @@ static int __mkroute_output(struct rtable **result,
struct rtable *rth;
struct in_device *in_dev;
u32 tos = RT_FL_TOS(oldflp);
- int err = 0;
- if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+ if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
return -EINVAL;
- if (fl->fl4_dst == htonl(0xFFFFFFFF))
+ if (ipv4_is_lbcast(fl->fl4_dst))
res->type = RTN_BROADCAST;
else if (ipv4_is_multicast(fl->fl4_dst))
res->type = RTN_MULTICAST;
- else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
+ else if (ipv4_is_zeronet(fl->fl4_dst))
return -EINVAL;
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- /* get work reference to inet device */
- in_dev = in_dev_get(dev_out);
+ in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev)
return -EINVAL;
if (res->type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- if (res->fi) {
- fib_info_put(res->fi);
- res->fi = NULL;
- }
+ res->fi = NULL;
} else if (res->type == RTN_MULTICAST) {
- flags |= RTCF_MULTICAST|RTCF_LOCAL;
+ flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
oldflp->proto))
flags &= ~RTCF_LOCAL;
/* If multicast route do not exist use
- default one, but do not gateway in this case.
- Yes, it is hack.
+ * default one, but do not gateway in this case.
+ * Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4) {
- fib_info_put(res->fi);
+ if (res->fi && res->prefixlen < 4)
res->fi = NULL;
- }
}
rth = dst_alloc(&ipv4_dst_ops);
- if (!rth) {
- err = -ENOBUFS;
- goto cleanup;
- }
+ if (!rth)
+ return -ENOBUFS;
+
+ in_dev_hold(in_dev);
+ rth->idev = in_dev;
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOXFRM))
- rth->u.dst.flags |= DST_NOXFRM;
+ rth->dst.flags |= DST_NOXFRM;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = oldflp->fl4_dst;
rth->fl.fl4_tos = tos;
@@ -2432,35 +2437,34 @@ static int __mkroute_output(struct rtable **result,
rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
/* get references to the devices that are to be hold by the routing
cache entry */
- rth->u.dst.dev = dev_out;
+ rth->dst.dev = dev_out;
dev_hold(dev_out);
- rth->idev = in_dev_get(dev_out);
rth->rt_gateway = fl->fl4_dst;
rth->rt_spec_dst= fl->fl4_src;
- rth->u.dst.output=ip_output;
- rth->u.dst.obsolete = -1;
+ rth->dst.output=ip_output;
+ rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
if (flags & RTCF_LOCAL) {
- rth->u.dst.input = ip_local_deliver;
+ rth->dst.input = ip_local_deliver;
rth->rt_spec_dst = fl->fl4_dst;
}
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
rth->rt_spec_dst = fl->fl4_src;
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
- rth->u.dst.output = ip_mc_output;
+ rth->dst.output = ip_mc_output;
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
if (res->type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
!ipv4_is_local_multicast(oldflp->fl4_dst)) {
- rth->u.dst.input = ip_mr_input;
- rth->u.dst.output = ip_mc_output;
+ rth->dst.input = ip_mr_input;
+ rth->dst.output = ip_mc_output;
}
}
#endif
@@ -2469,15 +2473,11 @@ static int __mkroute_output(struct rtable **result,
rt_set_nexthop(rth, res, 0);
rth->rt_flags = flags;
-
*result = rth;
- cleanup:
- /* release work reference to inet device */
- in_dev_put(in_dev);
-
- return err;
+ return 0;
}
+/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
struct fib_result *res,
const struct flowi *fl,
@@ -2499,6 +2499,7 @@ static int ip_mkroute_output(struct rtable **rp,
/*
* Major route resolver routine.
+ * called with rcu_read_lock();
*/
static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2517,9 +2518,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
.iif = net->loopback_dev->ifindex,
.oif = oldflp->oif };
struct fib_result res;
- unsigned flags = 0;
+ unsigned int flags = 0;
struct net_device *dev_out = NULL;
- int free_res = 0;
int err;
@@ -2545,9 +2545,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (oldflp->oif == 0 &&
(ipv4_is_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ ipv4_is_lbcast(oldflp->fl4_dst))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(net, oldflp->fl4_src);
+ dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
if (dev_out == NULL)
goto out;
@@ -2572,29 +2572,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(net, oldflp->fl4_src);
- if (dev_out == NULL)
+ if (!__ip_dev_find(net, oldflp->fl4_src, false))
goto out;
- dev_put(dev_out);
- dev_out = NULL;
}
}
if (oldflp->oif) {
- dev_out = dev_get_by_index(net, oldflp->oif);
+ dev_out = dev_get_by_index_rcu(net, oldflp->oif);
err = -ENODEV;
if (dev_out == NULL)
goto out;
/* RACE: Check return value of inet_select_addr instead. */
- if (__in_dev_get_rtnl(dev_out) == NULL) {
- dev_put(dev_out);
+ if (rcu_dereference(dev_out->ip_ptr) == NULL)
goto out; /* Wrong error code */
- }
if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+ ipv4_is_lbcast(oldflp->fl4_dst)) {
if (!fl.fl4_src)
fl.fl4_src = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
@@ -2614,10 +2609,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
fl.fl4_dst = fl.fl4_src;
if (!fl.fl4_dst)
fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
- if (dev_out)
- dev_put(dev_out);
dev_out = net->loopback_dev;
- dev_hold(dev_out);
fl.oif = net->loopback_dev->ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
@@ -2651,23 +2643,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
res.type = RTN_UNICAST;
goto make_route;
}
- if (dev_out)
- dev_put(dev_out);
err = -ENETUNREACH;
goto out;
}
- free_res = 1;
if (res.type == RTN_LOCAL) {
if (!fl.fl4_src)
fl.fl4_src = fl.fl4_dst;
- if (dev_out)
- dev_put(dev_out);
dev_out = net->loopback_dev;
- dev_hold(dev_out);
fl.oif = dev_out->ifindex;
- if (res.fi)
- fib_info_put(res.fi);
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
@@ -2684,28 +2668,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
if (!fl.fl4_src)
fl.fl4_src = FIB_RES_PREFSRC(res);
- if (dev_out)
- dev_put(dev_out);
dev_out = FIB_RES_DEV(res);
- dev_hold(dev_out);
fl.oif = dev_out->ifindex;
make_route:
err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
-
- if (free_res)
- fib_res_put(&res);
- if (dev_out)
- dev_put(dev_out);
out: return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
const struct flowi *flp)
{
- unsigned hash;
+ unsigned int hash;
+ int res;
struct rtable *rth;
if (!rt_caching(net))
@@ -2715,7 +2692,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rcu_read_lock_bh();
for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
+ rth = rcu_dereference_bh(rth->dst.rt_next)) {
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
rth->fl.iif == 0 &&
@@ -2723,9 +2700,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rth->fl.mark == flp->mark &&
!((rth->fl.fl4_tos ^ flp->fl4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
- net_eq(dev_net(rth->u.dst.dev), net) &&
+ net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
- dst_use(&rth->u.dst, jiffies);
+ dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
*rp = rth;
@@ -2736,11 +2713,18 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rcu_read_unlock_bh();
slow_output:
- return ip_route_output_slow(net, rp, flp);
+ rcu_read_lock();
+ res = ip_route_output_slow(net, rp, flp);
+ rcu_read_unlock();
+ return res;
}
-
EXPORT_SYMBOL_GPL(__ip_route_output_key);
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
+
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
@@ -2749,9 +2733,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
.destroy = ipv4_dst_destroy,
- .check = ipv4_dst_check,
+ .check = ipv4_blackhole_dst_check,
.update_pmtu = ipv4_rt_blackhole_update_pmtu,
- .entries = ATOMIC_INIT(0),
};
@@ -2762,15 +2745,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
dst_alloc(&ipv4_dst_blackhole_ops);
if (rt) {
- struct dst_entry *new = &rt->u.dst;
+ struct dst_entry *new = &rt->dst;
atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
- memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+ memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
- new->dev = ort->u.dst.dev;
+ new->dev = ort->dst.dev;
if (new->dev)
dev_hold(new->dev);
@@ -2794,9 +2777,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
dst_free(new);
}
- dst_release(&(*rp)->u.dst);
+ dst_release(&(*rp)->dst);
*rp = rt;
- return (rt ? 0 : -ENOMEM);
+ return rt ? 0 : -ENOMEM;
}
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -2822,13 +2805,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
return 0;
}
-
EXPORT_SYMBOL_GPL(ip_route_output_flow);
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
return ip_route_output_flow(net, rp, flp, NULL, 0);
}
+EXPORT_SYMBOL(ip_route_output_key);
static int rt_fill_info(struct net *net,
struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2864,11 +2847,11 @@ static int rt_fill_info(struct net *net,
r->rtm_src_len = 32;
NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
}
- if (rt->u.dst.dev)
- NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
+ if (rt->dst.dev)
+ NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
- if (rt->u.dst.tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
+ if (rt->dst.tclassid)
+ NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
if (rt->fl.iif)
NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
@@ -2878,12 +2861,16 @@ static int rt_fill_info(struct net *net,
if (rt->rt_dst != rt->rt_gateway)
NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
- if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+ if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
goto nla_put_failure;
- error = rt->u.dst.error;
- expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
+ if (rt->fl.mark)
+ NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+
+ error = rt->dst.error;
+ expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
if (rt->peer) {
+ inet_peer_refcheck(rt->peer);
id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
if (rt->peer->tcp_ts_stamp) {
ts = rt->peer->tcp_ts;
@@ -2914,7 +2901,7 @@ static int rt_fill_info(struct net *net,
NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
}
- if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
+ if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
expires, error) < 0)
goto nla_put_failure;
@@ -2935,6 +2922,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
__be32 src = 0;
u32 iif;
int err;
+ int mark;
struct sk_buff *skb;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
@@ -2962,6 +2950,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+ mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
if (iif) {
struct net_device *dev;
@@ -2974,13 +2963,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
+ skb->mark = mark;
local_bh_disable();
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
local_bh_enable();
rt = skb_rtable(skb);
- if (err == 0 && rt->u.dst.error)
- err = -rt->u.dst.error;
+ if (err == 0 && rt->dst.error)
+ err = -rt->dst.error;
} else {
struct flowi fl = {
.nl_u = {
@@ -2991,6 +2981,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
},
},
.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+ .mark = mark,
};
err = ip_route_output_key(net, &rt, &fl);
}
@@ -2998,7 +2989,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
if (err)
goto errout_free;
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
@@ -3034,12 +3025,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
rcu_read_lock_bh();
for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
- rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
- if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
+ rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
+ if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
continue;
if (rt_is_expired(rt))
continue;
- skb_dst_set_noref(skb, &rt->u.dst);
+ skb_dst_set_noref(skb, &rt->dst);
if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1, NLM_F_MULTI) <= 0) {
@@ -3313,6 +3304,12 @@ int __init ip_rt_init(void)
ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
+ if (dst_entries_init(&ipv4_dst_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+ if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+ panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
rt_hash_table = (struct rt_hash_bucket *)
alloc_large_system_hash("IP route cache",
sizeof(struct rt_hash_bucket),
@@ -3365,6 +3362,3 @@ void __init ip_static_sysctl_init(void)
register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif
-
-EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 9f6b22206c52..650cace2180d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -18,8 +18,8 @@
#include <net/tcp.h>
#include <net/route.h>
-/* Timestamps: lowest 9 bits store TCP options */
-#define TSBITS 9
+/* Timestamps: lowest bits store TCP options */
+#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
extern int sysctl_tcp_syncookies;
@@ -58,7 +58,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
/*
* when syncookies are in effect and tcp timestamps are enabled we encode
- * tcp options in the lowest 9 bits of the timestamp value that will be
+ * tcp options in the lower bits of the timestamp value that will be
* sent in the syn-ack.
* Since subsequent timestamps use the normal tcp_time_stamp value, we
* must make sure that the resulting initial timestamp is <= tcp_time_stamp.
@@ -70,11 +70,10 @@ __u32 cookie_init_timestamp(struct request_sock *req)
u32 options = 0;
ireq = inet_rsk(req);
- if (ireq->wscale_ok) {
- options = ireq->snd_wscale;
- options |= ireq->rcv_wscale << 4;
- }
- options |= ireq->sack_ok << 8;
+
+ options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
+ options |= ireq->sack_ok << 4;
+ options |= ireq->ecn_ok << 5;
ts = ts_now & ~TSMASK;
ts |= options;
@@ -138,23 +137,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
}
/*
- * This table has to be sorted and terminated with (__u16)-1.
- * XXX generate a better table.
- * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ * MSS Values are taken from the 2009 paper
+ * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
+ * - values 1440 to 1460 accounted for 80% of observed mss values
+ * - values outside the 536-1460 range are rare (<0.2%).
+ *
+ * Table must be sorted.
*/
static __u16 const msstab[] = {
- 64 - 1,
- 256 - 1,
- 512 - 1,
- 536 - 1,
- 1024 - 1,
- 1440 - 1,
- 1460 - 1,
- 4312 - 1,
- (__u16)-1
+ 64,
+ 512,
+ 536,
+ 1024,
+ 1440,
+ 1460,
+ 4312,
+ 8960,
};
-/* The number doesn't include the -1 terminator */
-#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
/*
* Generate a syncookie. mssp points to the mss, which is returned
@@ -169,10 +168,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
tcp_synq_overflow(sk);
- /* XXX sort msstab[] by probability? Binary search? */
- for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
- ;
- *mssp = msstab[mssind] + 1;
+ for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+ if (mss >= msstab[mssind])
+ break;
+ *mssp = msstab[mssind];
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
@@ -202,7 +201,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
jiffies / (HZ * 60),
COUNTER_TRIES);
- return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+ return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
@@ -227,26 +226,38 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
* additional tcp options in the timestamp.
* This extracts these options from the timestamp echo.
*
- * The lowest 4 bits are for snd_wscale
- * The next 4 lsb are for rcv_wscale
- * The next lsb is for sack_ok
+ * The lowest 4 bits store snd_wscale.
+ * next 2 bits indicate SACK and ECN support.
+ *
+ * return false if we decode an option that should not be.
*/
-void cookie_check_timestamp(struct tcp_options_received *tcp_opt)
+bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
{
- /* echoed timestamp, 9 lowest bits contain options */
+ /* echoed timestamp, lowest bits contain options */
u32 options = tcp_opt->rcv_tsecr & TSMASK;
- tcp_opt->snd_wscale = options & 0xf;
- options >>= 4;
- tcp_opt->rcv_wscale = options & 0xf;
+ if (!tcp_opt->saw_tstamp) {
+ tcp_clear_options(tcp_opt);
+ return true;
+ }
+
+ if (!sysctl_tcp_timestamps)
+ return false;
tcp_opt->sack_ok = (options >> 4) & 0x1;
+ *ecn_ok = (options >> 5) & 1;
+ if (*ecn_ok && !sysctl_tcp_ecn)
+ return false;
+
+ if (tcp_opt->sack_ok && !sysctl_tcp_sack)
+ return false;
- if (tcp_opt->sack_ok)
- tcp_sack_reset(tcp_opt);
+ if ((options & 0xf) == 0xf)
+ return true; /* no window scaling */
- if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale)
- tcp_opt->wscale_ok = 1;
+ tcp_opt->wscale_ok = 1;
+ tcp_opt->snd_wscale = options & 0xf;
+ return sysctl_tcp_window_scaling != 0;
}
EXPORT_SYMBOL(cookie_check_timestamp);
@@ -265,8 +276,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
int mss;
struct rtable *rt;
__u8 rcv_wscale;
+ bool ecn_ok;
- if (!sysctl_tcp_syncookies || !th->ack)
+ if (!sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
if (tcp_synq_no_recent_overflow(sk) ||
@@ -281,8 +293,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
memset(&tcp_opt, 0, sizeof(tcp_opt));
tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
- if (tcp_opt.saw_tstamp)
- cookie_check_timestamp(&tcp_opt);
+ if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+ goto out;
ret = NULL;
req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
@@ -298,9 +310,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->rmt_port = th->source;
ireq->loc_addr = ip_hdr(skb)->daddr;
ireq->rmt_addr = ip_hdr(skb)->saddr;
- ireq->ecn_ok = 0;
+ ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale;
- ireq->rcv_wscale = tcp_opt.rcv_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
@@ -354,15 +365,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW);
+ req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
&req->rcv_wnd, &req->window_clamp,
ireq->wscale_ok, &rcv_wscale,
- dst_metric(&rt->u.dst, RTAX_INITRWND));
+ dst_metric(&rt->dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
- ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
+ ret = get_cookie_sock(sk, skb, req, &rt->dst);
out: return ret;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 65afeaec15b7..1664a0590bb8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -315,7 +315,6 @@ struct tcp_splice_state {
* is strict, actions are advisory and have some latency.
*/
int tcp_memory_pressure __read_mostly;
-
EXPORT_SYMBOL(tcp_memory_pressure);
void tcp_enter_memory_pressure(struct sock *sk)
@@ -325,7 +324,6 @@ void tcp_enter_memory_pressure(struct sock *sk)
tcp_memory_pressure = 1;
}
}
-
EXPORT_SYMBOL(tcp_enter_memory_pressure);
/* Convert seconds to retransmits based on initial and max timeout */
@@ -388,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
*/
mask = 0;
- if (sk->sk_err)
- mask = POLLERR;
/*
* POLLHUP is certainly not done right. But poll() doesn't
@@ -453,13 +449,20 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
mask |= POLLOUT | POLLWRNORM;
}
- }
+ } else
+ mask |= POLLOUT | POLLWRNORM;
if (tp->urg_data & TCP_URG_VALID)
mask |= POLLPRI;
}
+ /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ smp_rmb();
+ if (sk->sk_err)
+ mask |= POLLERR;
+
return mask;
}
+EXPORT_SYMBOL(tcp_poll);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
@@ -508,10 +511,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int __user *)arg);
}
+EXPORT_SYMBOL(tcp_ioctl);
static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
}
@@ -527,7 +531,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
skb->csum = 0;
tcb->seq = tcb->end_seq = tp->write_seq;
- tcb->flags = TCPCB_FLAG_ACK;
+ tcb->flags = TCPHDR_ACK;
tcb->sacked = 0;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
@@ -676,6 +680,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
return ret;
}
+EXPORT_SYMBOL(tcp_splice_read);
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
@@ -816,7 +821,7 @@ new_segment:
skb_shinfo(skb)->gso_segs = 0;
if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
copied += copy;
poffset += copy;
@@ -857,15 +862,15 @@ out_err:
return sk_stream_error(sk, flags, err);
}
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
- size_t size, int flags)
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
ssize_t res;
- struct sock *sk = sock->sk;
if (!(sk->sk_route_caps & NETIF_F_SG) ||
!(sk->sk_route_caps & NETIF_F_ALL_CSUM))
- return sock_no_sendpage(sock, page, offset, size, flags);
+ return sock_no_sendpage(sk->sk_socket, page, offset, size,
+ flags);
lock_sock(sk);
TCP_CHECK_TIMER(sk);
@@ -874,6 +879,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
release_sock(sk);
return res;
}
+EXPORT_SYMBOL(tcp_sendpage);
#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
#define TCP_OFF(sk) (sk->sk_sndmsg_off)
@@ -898,10 +904,9 @@ static inline int select_size(struct sock *sk, int sg)
return tmp;
}
-int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
- struct sock *sk = sock->sk;
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -938,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
sg = sk->sk_route_caps & NETIF_F_SG;
while (--iovlen >= 0) {
- int seglen = iov->iov_len;
+ size_t seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
iov++;
@@ -1062,7 +1067,7 @@ new_segment:
}
if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
@@ -1122,6 +1127,7 @@ out_err:
release_sock(sk);
return err;
}
+EXPORT_SYMBOL(tcp_sendmsg);
/*
* Handle reading urgent data. BSD has very simple semantics for
@@ -1381,6 +1387,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_cleanup_rbuf(sk, copied);
return copied;
}
+EXPORT_SYMBOL(tcp_read_sock);
/*
* This routine copies from a sock struct into the user buffer.
@@ -1775,6 +1782,7 @@ recv_urg:
err = tcp_recv_urg(sk, msg, len, flags);
goto out;
}
+EXPORT_SYMBOL(tcp_recvmsg);
void tcp_set_state(struct sock *sk, int state)
{
@@ -1867,6 +1875,7 @@ void tcp_shutdown(struct sock *sk, int how)
tcp_send_fin(sk);
}
}
+EXPORT_SYMBOL(tcp_shutdown);
void tcp_close(struct sock *sk, long timeout)
{
@@ -1899,6 +1908,10 @@ void tcp_close(struct sock *sk, long timeout)
sk_mem_reclaim(sk);
+ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+ if (sk->sk_state == TCP_CLOSE)
+ goto adjudge_to_death;
+
/* As outlined in RFC 2525, section 2.17, we send a RST here because
* data was lost. To witness the awful effects of the old behavior of
* always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
@@ -2002,11 +2015,8 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- int orphan_count = percpu_counter_read_positive(
- sk->sk_prot->orphan_count);
-
sk_mem_reclaim(sk);
- if (tcp_too_many_orphans(sk, orphan_count)) {
+ if (tcp_too_many_orphans(sk, 0)) {
if (net_ratelimit())
printk(KERN_INFO "TCP: too many of orphaned "
"sockets\n");
@@ -2026,6 +2036,7 @@ out:
local_bh_enable();
sock_put(sk);
}
+EXPORT_SYMBOL(tcp_close);
/* These states need RST on ABORT according to RFC793 */
@@ -2099,6 +2110,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_error_report(sk);
return err;
}
+EXPORT_SYMBOL(tcp_disconnect);
/*
* Socket option code for TCP.
@@ -2176,6 +2188,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
GFP_KERNEL);
if (cvp == NULL)
return -ENOMEM;
+
+ kref_init(&cvp->kref);
}
lock_sock(sk);
tp->rx_opt.cookie_in_always =
@@ -2190,12 +2204,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
*/
kref_put(&tp->cookie_values->kref,
tcp_cookie_values_release);
- kref_init(&cvp->kref);
- tp->cookie_values = cvp;
} else {
cvp = tp->cookie_values;
}
}
+
if (cvp != NULL) {
cvp->cookie_desired = ctd.tcpct_cookie_desired;
@@ -2209,6 +2222,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
cvp->s_data_desired = ctd.tcpct_s_data_desired;
cvp->s_data_constant = 0; /* false */
}
+
+ tp->cookie_values = cvp;
}
release_sock(sk);
return err;
@@ -2377,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = tp->af_specific->md5_parse(sk, optval, optlen);
break;
#endif
-
+ case TCP_USER_TIMEOUT:
+ /* Cap the max timeout in ms TCP will retry/retrans
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -2397,6 +2417,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
+EXPORT_SYMBOL(tcp_setsockopt);
#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
@@ -2407,7 +2428,6 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
@@ -2473,7 +2493,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
}
-
EXPORT_SYMBOL_GPL(tcp_get_info);
static int do_tcp_getsockopt(struct sock *sk, int level,
@@ -2591,6 +2610,16 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
}
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ val = tp->thin_lto;
+ break;
+ case TCP_THIN_DUPACK:
+ val = tp->thin_dupack;
+ break;
+
+ case TCP_USER_TIMEOUT:
+ val = jiffies_to_msecs(icsk->icsk_user_timeout);
+ break;
default:
return -ENOPROTOOPT;
}
@@ -2612,6 +2641,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
optval, optlen);
return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
+EXPORT_SYMBOL(tcp_getsockopt);
#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -2622,7 +2652,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
optval, optlen);
return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
@@ -2859,7 +2888,6 @@ void tcp_free_md5sig_pool(void)
if (pool)
__tcp_free_md5sig_pool(pool);
}
-
EXPORT_SYMBOL(tcp_free_md5sig_pool);
static struct tcp_md5sig_pool * __percpu *
@@ -2935,7 +2963,6 @@ retry:
}
return pool;
}
-
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
@@ -2959,7 +2986,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
spin_unlock(&tcp_md5sig_pool_lock);
if (p)
- return *per_cpu_ptr(p, smp_processor_id());
+ return *this_cpu_ptr(p);
local_bh_enable();
return NULL;
@@ -2987,7 +3014,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
th->check = old_checksum;
return err;
}
-
EXPORT_SYMBOL(tcp_md5_hash_header);
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
@@ -3000,6 +3026,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
const unsigned head_data_len = skb_headlen(skb) > header_len ?
skb_headlen(skb) - header_len : 0;
const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
sg_init_table(&sg, 1);
@@ -3014,9 +3041,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
return 1;
}
+ skb_walk_frags(skb, frag_iter)
+ if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
+ return 1;
+
return 0;
}
-
EXPORT_SYMBOL(tcp_md5_hash_skb_data);
int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
@@ -3026,7 +3056,6 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
sg_init_one(&sg, key->key, key->keylen);
return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
-
EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
@@ -3193,7 +3222,7 @@ void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long nr_pages, limit;
- int order, i, max_share;
+ int i, max_share, cnt;
unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3242,22 +3271,12 @@ void __init tcp_init(void)
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
}
- /* Try to be a bit smarter and adjust defaults depending
- * on available memory.
- */
- for (order = 0; ((1 << order) << PAGE_SHIFT) <
- (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
- order++)
- ;
- if (order >= 4) {
- tcp_death_row.sysctl_max_tw_buckets = 180000;
- sysctl_tcp_max_orphans = 4096 << (order - 4);
- sysctl_max_syn_backlog = 1024;
- } else if (order < 3) {
- tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
- sysctl_tcp_max_orphans >>= (3 - order);
- sysctl_max_syn_backlog = 128;
- }
+
+ cnt = tcp_hashinfo.ehash_mask + 1;
+
+ tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+ sysctl_tcp_max_orphans = cnt / 2;
+ sysctl_max_syn_backlog = max(128, cnt / 256);
/* Set the pressure threshold to be a fraction of global memory that
* is up to 1/2 at 256 MB, decreasing toward zero with the amount of
@@ -3298,16 +3317,3 @@ void __init tcp_init(void)
tcp_secret_retiring = &tcp_secret_two;
tcp_secret_secondary = &tcp_secret_two;
}
-
-EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_disconnect);
-EXPORT_SYMBOL(tcp_getsockopt);
-EXPORT_SYMBOL(tcp_ioctl);
-EXPORT_SYMBOL(tcp_poll);
-EXPORT_SYMBOL(tcp_read_sock);
-EXPORT_SYMBOL(tcp_recvmsg);
-EXPORT_SYMBOL(tcp_sendmsg);
-EXPORT_SYMBOL(tcp_splice_read);
-EXPORT_SYMBOL(tcp_sendpage);
-EXPORT_SYMBOL(tcp_setsockopt);
-EXPORT_SYMBOL(tcp_shutdown);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 0ec9bd0ae94f..850c737e08e2 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
int tcp_set_allowed_congestion_control(char *val)
{
struct tcp_congestion_ops *ca;
- char *clone, *name;
+ char *saved_clone, *clone, *name;
int ret = 0;
- clone = kstrdup(val, GFP_USER);
+ saved_clone = clone = kstrdup(val, GFP_USER);
if (!clone)
return -ENOMEM;
@@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val)
}
out:
spin_unlock(&tcp_cong_list_lock);
+ kfree(saved_clone);
return ret;
}
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1eba160b72dc..00ca688d8964 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -6,7 +6,7 @@
* The algorithm is described in:
* "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
* for High-Speed Networks"
- * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf
+ * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
*
* Implemented from description in paper and ns-2 simulation.
* Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 548d575e6cc6..3357f69e353d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_ecn __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
@@ -179,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk);
@@ -419,15 +422,16 @@ void tcp_initialize_rcv_mss(struct sock *sk)
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
*
* More detail on this code can be found at
- * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * <http://staff.psc.edu/jheffner/>,
* though this reference is out of date. A new paper
* is pending.
*/
@@ -801,25 +805,12 @@ void tcp_update_metrics(struct sock *sk)
}
}
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- * The RFC specifies a window of no more than 4380 bytes
- * unless 2*MSS > 4380. Reading the pseudocode in the RFC
- * is a bit misleading because they use a clamp at 4380 bytes
- * rather than use a multiplier in the relevant range.
- */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
- if (!cwnd) {
- if (tp->mss_cache > 1460)
- cwnd = 2;
- else
- cwnd = (tp->mss_cache > 1095) ? 3 : 4;
- }
+ if (!cwnd)
+ cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
@@ -2310,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
{
- return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+ return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
}
static inline int tcp_head_timedout(struct sock *sk)
@@ -2504,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk)
/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
* is against sacked "cnt", otherwise it's against facked "cnt"
*/
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -2512,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
int err;
unsigned int mss;
- if (packets == 0)
- return;
-
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
cnt = tp->lost_cnt_hint;
+ /* Head already handled? */
+ if (mark_head && skb != tcp_write_queue_head(sk))
+ return;
} else {
skb = tcp_write_queue_head(sk);
cnt = 0;
@@ -2541,7 +2532,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
cnt += tcp_skb_pcount(skb);
if (cnt > packets) {
- if (tcp_is_sack(tp) || (oldcnt >= packets))
+ if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+ (oldcnt >= packets))
break;
mss = skb_shinfo(skb)->gso_size;
@@ -2552,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
}
tcp_skb_mark_lost(tp, skb);
+
+ if (mark_head)
+ break;
}
tcp_verify_left_out(tp);
}
@@ -2563,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_reno(tp)) {
- tcp_mark_head_lost(sk, 1);
+ tcp_mark_head_lost(sk, 1, 1);
} else if (tcp_is_fack(tp)) {
int lost = tp->fackets_out - tp->reordering;
if (lost <= 0)
lost = 1;
- tcp_mark_head_lost(sk, lost);
+ tcp_mark_head_lost(sk, lost, 0);
} else {
int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto < fast_rexmit)
- sacked_upto = fast_rexmit;
- tcp_mark_head_lost(sk, sacked_upto);
+ if (sacked_upto >= 0)
+ tcp_mark_head_lost(sk, sacked_upto, 0);
+ else if (fast_rexmit)
+ tcp_mark_head_lost(sk, 1, 1);
}
tcp_timeout_skbs(sk);
@@ -2882,7 +2878,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
- tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
@@ -2938,6 +2934,7 @@ void tcp_simple_retransmit(struct sock *sk)
}
tcp_xmit_retransmit_queue(sk);
}
+EXPORT_SYMBOL(tcp_simple_retransmit);
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
@@ -2978,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+ tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
}
@@ -3286,7 +3283,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if (!(scb->flags & TCPCB_FLAG_SYN)) {
+ if (!(scb->flags & TCPHDR_SYN)) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
@@ -3406,8 +3403,8 @@ static void tcp_ack_probe(struct sock *sk)
static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
- return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
- inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+ return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3424,9 +3421,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
{
- return (after(ack, tp->snd_una) ||
+ return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
- (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+ (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* Update our send window.
@@ -3858,6 +3855,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
}
}
}
+EXPORT_SYMBOL(tcp_parse_options);
static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
{
@@ -3924,13 +3922,14 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
if (opsize < 2 || opsize > length)
return NULL;
if (opcode == TCPOPT_MD5SIG)
- return ptr;
+ return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
}
ptr += opsize - 2;
length -= opsize;
}
return NULL;
}
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -4041,6 +4040,8 @@ static void tcp_reset(struct sock *sk)
default:
sk->sk_err = ECONNRESET;
}
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
@@ -5432,6 +5433,7 @@ discard:
__kfree_skb(skb);
return 0;
}
+EXPORT_SYMBOL(tcp_rcv_established);
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
@@ -5931,14 +5933,4 @@ discard:
}
return 0;
}
-
-EXPORT_SYMBOL(sysctl_tcp_ecn);
-EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-EXPORT_SYMBOL(tcp_parse_options);
-#ifdef CONFIG_TCP_MD5SIG
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
-#endif
-EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fe193e53af44..8f8527d41682 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,6 +84,7 @@
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
#ifdef CONFIG_TCP_MD5SIG
@@ -100,6 +101,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
#endif
struct inet_hashinfo tcp_hashinfo;
+EXPORT_SYMBOL(tcp_hashinfo);
static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
@@ -139,7 +141,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
return 0;
}
-
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
@@ -204,10 +205,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* TIME-WAIT * and initialize rx_opt.ts_recent from it,
* when trying new connection.
*/
- if (peer != NULL &&
- (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
- tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
- tp->rx_opt.ts_recent = peer->tcp_ts;
+ if (peer) {
+ inet_peer_refcheck(peer);
+ if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
+ tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
+ tp->rx_opt.ts_recent = peer->tcp_ts;
+ }
}
}
@@ -237,7 +240,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -265,6 +268,7 @@ failure:
inet->inet_dport = 0;
return err;
}
+EXPORT_SYMBOL(tcp_v4_connect);
/*
* This routine does path mtu discovery as defined in RFC1191.
@@ -543,6 +547,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
+EXPORT_SYMBOL(tcp_v4_send_check);
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
@@ -793,19 +798,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-#ifdef CONFIG_SYN_COOKIES
-static void syn_flood_warning(struct sk_buff *skb)
+static void syn_flood_warning(const struct sk_buff *skb)
{
- static unsigned long warntime;
+ const char *msg;
- if (time_after(jiffies, (warntime + HZ * 60))) {
- warntime = jiffies;
- printk(KERN_INFO
- "possible SYN flooding on port %d. Sending cookies.\n",
- ntohs(tcp_hdr(skb)->dest));
- }
-}
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies)
+ msg = "Sending cookies";
+ else
#endif
+ msg = "Dropping request";
+
+ pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
+ ntohs(tcp_hdr(skb)->dest), msg);
+}
/*
* Save and compile IPv4 options into the request_sock if needed.
@@ -857,7 +863,6 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
{
return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
-
EXPORT_SYMBOL(tcp_v4_md5_lookup);
static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
@@ -924,7 +929,6 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
}
return 0;
}
-
EXPORT_SYMBOL(tcp_v4_md5_do_add);
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
@@ -962,7 +966,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
}
return -ENOENT;
}
-
EXPORT_SYMBOL(tcp_v4_md5_do_del);
static void tcp_v4_clear_md5_list(struct sock *sk)
@@ -1135,7 +1138,6 @@ clear_hash_noput:
memset(md5_hash, 0, 16);
return 1;
}
-
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
@@ -1243,6 +1245,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* evidently real one.
*/
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+ if (net_ratelimit())
+ syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
@@ -1323,15 +1327,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
- if (!want_cookie)
+ if (!want_cookie || tmp_opt.tstamp_ok)
TCP_ECN_create_request(req, tcp_hdr(skb));
if (want_cookie) {
-#ifdef CONFIG_SYN_COOKIES
- syn_flood_warning(skb);
- req->cookie_ts = tmp_opt.tstamp_ok;
-#endif
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
struct inet_peer *peer = NULL;
@@ -1349,6 +1350,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
+ inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
@@ -1393,6 +1395,7 @@ drop_and_free:
drop:
return 0;
}
+EXPORT_SYMBOL(tcp_v4_conn_request);
/*
@@ -1419,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
- goto exit;
+ goto exit_nonewsk;
newsk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(newsk, dst);
@@ -1466,18 +1469,23 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
}
#endif
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ sock_put(newsk);
+ goto exit;
+ }
__inet_hash_nolisten(newsk, NULL);
- __inet_inherit_port(sk, newsk);
return newsk;
exit_overflow:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+ dst_release(dst);
exit:
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
- dst_release(dst);
return NULL;
}
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
@@ -1504,7 +1512,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
}
#ifdef CONFIG_SYN_COOKIES
- if (!th->rst && !th->syn && th->ack)
+ if (!th->syn)
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
return sk;
@@ -1607,6 +1615,7 @@ csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
+EXPORT_SYMBOL(tcp_v4_do_rcv);
/*
* From tcp_input.c
@@ -1793,6 +1802,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL(tcp_v4_remember_stamp);
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
@@ -1832,6 +1842,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.compat_getsockopt = compat_ip_getsockopt,
#endif
};
+EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
@@ -1960,7 +1971,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
percpu_counter_dec(&tcp_sockets_allocated);
}
-
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
@@ -1978,6 +1988,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
+/*
+ * Get next listener socket follow cur. If cur is NULL, get first socket
+ * starting from bucket given in st->bucket; when st->bucket is zero the
+ * very first socket in the hash table is returned.
+ */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
struct inet_connection_sock *icsk;
@@ -1988,14 +2003,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct net *net = seq_file_net(seq);
if (!sk) {
- st->bucket = 0;
- ilb = &tcp_hashinfo.listening_hash[0];
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
sk = sk_nulls_head(&ilb->head);
+ st->offset = 0;
goto get_sk;
}
ilb = &tcp_hashinfo.listening_hash[st->bucket];
++st->num;
+ ++st->offset;
if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur;
@@ -2010,6 +2026,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
+ st->offset = 0;
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
@@ -2045,6 +2062,7 @@ start_req:
read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
+ st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE) {
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
@@ -2058,7 +2076,12 @@ out:
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
- void *rc = listening_get_next(seq, NULL);
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ st->offset = 0;
+ rc = listening_get_next(seq, NULL);
while (rc && *pos) {
rc = listening_get_next(seq, rc);
@@ -2073,13 +2096,18 @@ static inline int empty_bucket(struct tcp_iter_state *st)
hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
+/*
+ * Get first established socket starting from bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ */
static void *established_get_first(struct seq_file *seq)
{
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
void *rc = NULL;
- for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+ st->offset = 0;
+ for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
struct inet_timewait_sock *tw;
@@ -2124,6 +2152,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
struct net *net = seq_file_net(seq);
++st->num;
+ ++st->offset;
if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
tw = cur;
@@ -2140,6 +2169,7 @@ get_tw:
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* Look for next non empty bucket */
+ st->offset = 0;
while (++st->bucket <= tcp_hashinfo.ehash_mask &&
empty_bucket(st))
;
@@ -2167,7 +2197,11 @@ out:
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
- void *rc = established_get_first(seq);
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ rc = established_get_first(seq);
while (rc && pos) {
rc = established_get_next(seq, rc);
@@ -2192,24 +2226,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
return rc;
}
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+ struct tcp_iter_state *st = seq->private;
+ int offset = st->offset;
+ int orig_num = st->num;
+ void *rc = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ case TCP_SEQ_STATE_LISTENING:
+ if (st->bucket >= INET_LHTABLE_SIZE)
+ break;
+ st->state = TCP_SEQ_STATE_LISTENING;
+ rc = listening_get_next(seq, NULL);
+ while (offset-- && rc)
+ rc = listening_get_next(seq, rc);
+ if (rc)
+ break;
+ st->bucket = 0;
+ /* Fallthrough */
+ case TCP_SEQ_STATE_ESTABLISHED:
+ case TCP_SEQ_STATE_TIME_WAIT:
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ if (st->bucket > tcp_hashinfo.ehash_mask)
+ break;
+ rc = established_get_first(seq);
+ while (offset-- && rc)
+ rc = established_get_next(seq, rc);
+ }
+
+ st->num = orig_num;
+
+ return rc;
+}
+
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ if (*pos && *pos == st->last_pos) {
+ rc = tcp_seek_last_pos(seq);
+ if (rc)
+ goto out;
+ }
+
st->state = TCP_SEQ_STATE_LISTENING;
st->num = 0;
- return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+ st->bucket = 0;
+ st->offset = 0;
+ rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+ st->last_pos = *pos;
+ return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct tcp_iter_state *st = seq->private;
void *rc = NULL;
- struct tcp_iter_state *st;
if (v == SEQ_START_TOKEN) {
rc = tcp_get_idx(seq, 0);
goto out;
}
- st = seq->private;
switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
@@ -2217,6 +2299,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
rc = listening_get_next(seq, v);
if (!rc) {
st->state = TCP_SEQ_STATE_ESTABLISHED;
+ st->bucket = 0;
+ st->offset = 0;
rc = established_get_first(seq);
}
break;
@@ -2227,6 +2311,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
out:
++*pos;
+ st->last_pos = *pos;
return rc;
}
@@ -2265,6 +2350,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file)
s = ((struct seq_file *)file->private_data)->private;
s->family = afinfo->family;
+ s->last_pos = 0;
return 0;
}
@@ -2288,11 +2374,13 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
rc = -ENOMEM;
return rc;
}
+EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
proc_net_remove(net, afinfo->name);
}
+EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(struct sock *sk, struct request_sock *req,
struct seq_file *f, int i, int uid, int *len)
@@ -2487,7 +2575,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
return tcp_gro_receive(head, skb);
}
-EXPORT_SYMBOL(tcp4_gro_receive);
int tcp4_gro_complete(struct sk_buff *skb)
{
@@ -2500,7 +2587,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
return tcp_gro_complete(skb);
}
-EXPORT_SYMBOL(tcp4_gro_complete);
struct proto tcp_prot = {
.name = "TCP",
@@ -2516,6 +2602,8 @@ struct proto tcp_prot = {
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.hash = inet_hash,
.unhash = inet_unhash,
@@ -2534,11 +2622,13 @@ struct proto tcp_prot = {
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
+ .no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
};
+EXPORT_SYMBOL(tcp_prot);
static int __net_init tcp_sk_init(struct net *net)
@@ -2569,20 +2659,3 @@ void __init tcp_v4_init(void)
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
-
-EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_v4_conn_request);
-EXPORT_SYMBOL(tcp_v4_connect);
-EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_remember_stamp);
-EXPORT_SYMBOL(tcp_v4_send_check);
-EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(tcp_proc_register);
-EXPORT_SYMBOL(tcp_proc_unregister);
-#endif
-EXPORT_SYMBOL(sysctl_tcp_low_latency);
-
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 794c2e122a41..43cf901d7659 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -47,7 +47,6 @@ struct inet_timewait_death_row tcp_death_row = {
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&tcp_death_row),
};
-
EXPORT_SYMBOL_GPL(tcp_death_row);
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
@@ -56,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
return 1;
if (after(end_seq, s_win) && before(seq, e_win))
return 1;
- return (seq == e_win && seq == end_seq);
+ return seq == e_win && seq == end_seq;
}
/*
@@ -262,6 +261,7 @@ kill:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
+EXPORT_SYMBOL(tcp_timewait_state_process);
/*
* Move a socket to time-wait or dead fin-wait-2 state.
@@ -362,7 +362,6 @@ void tcp_twsk_destructor(struct sock *sk)
tcp_free_md5sig_pool();
#endif
}
-
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
@@ -510,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
}
return newsk;
}
+EXPORT_SYMBOL(tcp_create_openreq_child);
/*
* Process an incoming packet for SYN_RECV sockets represented
@@ -706,6 +706,7 @@ embryonic_reset:
inet_csk_reqsk_queue_drop(sk, req, prev);
return NULL;
}
+EXPORT_SYMBOL(tcp_check_req);
/*
* Queue segment on the new socket if the new socket is active,
@@ -737,8 +738,4 @@ int tcp_child_process(struct sock *parent, struct sock *child,
sock_put(child);
return ret;
}
-
-EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
-EXPORT_SYMBOL(tcp_create_openreq_child);
-EXPORT_SYMBOL(tcp_timewait_state_process);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7ed9dc1042d1..05b1ecf36763 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
}
}
- /* Set initial window to value enough for senders,
- * following RFC2414. Senders, not following this RFC,
- * will be satisfied with 2.
- */
+ /* Set initial window to value enough for senders, following RFC5681. */
if (mss > (1 << *rcv_wscale)) {
- int init_cwnd = 4;
- if (mss > 1460 * 3)
- init_cwnd = 2;
- else if (mss > 1460)
- init_cwnd = 3;
+ int init_cwnd = rfc3390_bytes_to_packets(mss);
+
/* when initializing use the value from init_rcv_wnd
* rather than the default from above
*/
@@ -247,6 +241,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
/* Set the clamp no higher than max representable value */
(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
+EXPORT_SYMBOL(tcp_select_initial_window);
/* Chose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied. The return
@@ -294,9 +289,9 @@ static u16 tcp_select_window(struct sock *sk)
/* Packet ECN state for a SYN-ACK */
static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
{
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
}
/* Packet ECN state for a SYN. */
@@ -306,7 +301,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
tp->ecn_flags = 0;
if (sysctl_tcp_ecn == 1) {
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
}
@@ -361,7 +356,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
skb_shinfo(skb)->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
- if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
+ if (flags & (TCPHDR_SYN | TCPHDR_FIN))
seq++;
TCP_SKB_CB(skb)->end_seq = seq;
}
@@ -820,7 +815,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
+ if (unlikely(tcb->flags & TCPHDR_SYN))
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
else
tcp_options_size = tcp_established_options(sk, skb, &opts,
@@ -843,7 +838,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->flags);
- if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ if (unlikely(tcb->flags & TCPHDR_SYN)) {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
@@ -866,7 +861,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
- if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
+ if (likely((tcb->flags & TCPHDR_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
@@ -880,7 +875,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
icsk->icsk_af_ops->send_check(sk, skb);
- if (likely(tcb->flags & TCPCB_FLAG_ACK))
+ if (likely(tcb->flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
@@ -1023,7 +1018,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
@@ -1189,6 +1184,7 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
}
+EXPORT_SYMBOL(tcp_mtup_init);
/* This function synchronize snd mss to current pmtu/exthdr set.
@@ -1232,6 +1228,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
return mss_now;
}
+EXPORT_SYMBOL(tcp_sync_mss);
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
@@ -1328,8 +1325,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
u32 in_flight, cwnd;
/* Don't be strict about the congestion window for the final FIN. */
- if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
- tcp_skb_pcount(skb) == 1)
+ if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
return 1;
in_flight = tcp_packets_in_flight(tp);
@@ -1374,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned mss_now, int nonagle)
{
- return (skb->len < mss_now &&
+ return skb->len < mss_now &&
((nonagle & TCP_NAGLE_CORK) ||
- (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
+ (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
/* Return non-zero if the Nagle test allows this packet to be
@@ -1398,7 +1394,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
* Nagle can be ignored during F-RTO too (see RFC4138).
*/
if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
- (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+ (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
return 1;
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1447,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
- return (skb &&
+ return skb &&
tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
- tp->nonagle : TCP_NAGLE_PUSH)));
+ tp->nonagle : TCP_NAGLE_PUSH));
}
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1461,7 +1457,7 @@ int tcp_may_send_now(struct sock *sk)
* packet has never been sent out before (and thus is not cloned).
*/
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
- unsigned int mss_now)
+ unsigned int mss_now, gfp_t gfp)
{
struct sk_buff *buff;
int nlen = skb->len - len;
@@ -1471,7 +1467,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (skb->len != skb->data_len)
return tcp_fragment(sk, skb, len, mss_now);
- buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
+ buff = sk_stream_alloc_skb(sk, 0, gfp);
if (unlikely(buff == NULL))
return -ENOMEM;
@@ -1487,7 +1483,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->flags = flags;
/* This packet was never sent out yet, so no SACK bits. */
@@ -1518,7 +1514,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 send_win, cong_win, limit, in_flight;
- if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+ if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
goto send_now;
if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1644,7 +1640,7 @@ static int tcp_mtu_probe(struct sock *sk)
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
- TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
TCP_SKB_CB(nskb)->sacked = 0;
nskb->csum = 0;
nskb->ip_summed = skb->ip_summed;
@@ -1669,7 +1665,7 @@ static int tcp_mtu_probe(struct sock *sk)
sk_wmem_free_skb(sk, skb);
} else {
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
- ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ ~(TCPHDR_FIN|TCPHDR_PSH);
if (!skb_shinfo(skb)->nr_frags) {
skb_pull(skb, copy);
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -1769,7 +1765,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2020,7 +2016,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (!sysctl_tcp_retrans_collapse)
return;
- if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
+ if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
return;
tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2112,7 +2108,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* since it is cheap to do so and saves bytes on the network.
*/
if (skb->len > 0 &&
- (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+ (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
if (!pskb_trim(skb, 0)) {
/* Reuse, even though it does some unnecessary work */
@@ -2304,7 +2300,7 @@ void tcp_send_fin(struct sock *sk)
mss_now = tcp_current_mss(sk);
if (tcp_send_head(sk) != NULL) {
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
} else {
@@ -2321,7 +2317,7 @@ void tcp_send_fin(struct sock *sk)
skb_reserve(skb, MAX_TCP_HEADER);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
- TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+ TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2346,7 +2342,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
- TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+ TCPHDR_ACK | TCPHDR_RST);
/* Send it off. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2366,11 +2362,11 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *skb;
skb = tcp_write_queue_head(sk);
- if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
+ if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
return -EFAULT;
}
- if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
+ if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
if (nskb == NULL)
@@ -2384,7 +2380,7 @@ int tcp_send_synack(struct sock *sk)
skb = nskb;
}
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
TCP_ECN_send_synack(tcp_sk(sk), skb);
}
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2427,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
__u8 rcv_wscale;
/* Set this up on the first call only */
req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+ req->window_clamp = tcp_full_space(sk);
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(tcp_full_space(sk),
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2463,7 +2465,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
* not even correctly set)
*/
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
- TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+ TCPHDR_SYN | TCPHDR_ACK);
if (OPTION_COOKIE_EXTENSION & opts.options) {
if (s_data_desired) {
@@ -2518,6 +2520,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
return skb;
}
+EXPORT_SYMBOL(tcp_make_synack);
/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
@@ -2552,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk)
tcp_initialize_rcv_mss(sk);
+ /* limit the window selection if the user enforce a smaller rx buffer */
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+ tp->window_clamp = tcp_full_space(sk);
+
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
@@ -2595,7 +2603,7 @@ int tcp_connect(struct sock *sk)
skb_reserve(buff, MAX_TCP_HEADER);
tp->snd_nxt = tp->write_seq;
- tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
+ tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
TCP_ECN_send_syn(sk, buff);
/* Send it off. */
@@ -2620,6 +2628,7 @@ int tcp_connect(struct sock *sk)
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
+EXPORT_SYMBOL(tcp_connect);
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
@@ -2701,7 +2710,7 @@ void tcp_send_ack(struct sock *sk)
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
- tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
+ tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2735,7 +2744,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
- tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK);
+ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
@@ -2765,13 +2774,13 @@ int tcp_write_wakeup(struct sock *sk)
if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
skb->len > mss) {
seg_size = min(seg_size, mss);
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
if (tcp_fragment(sk, skb, seg_size, mss))
return -1;
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(sk, skb, mss);
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err)
@@ -2824,10 +2833,3 @@ void tcp_send_probe0(struct sock *sk)
TCP_RTO_MAX);
}
}
-
-EXPORT_SYMBOL(tcp_select_initial_window);
-EXPORT_SYMBOL(tcp_connect);
-EXPORT_SYMBOL(tcp_make_synack);
-EXPORT_SYMBOL(tcp_simple_retransmit);
-EXPORT_SYMBOL(tcp_sync_mss);
-EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e8..6211e2114173 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
.owner = THIS_MODULE,
.open = tcpprobe_open,
.read = tcpprobe_read,
+ .llseek = noop_llseek,
};
static __init int tcpprobe_init(void)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 440a5c6004f6..74a6aa003657 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -41,7 +41,6 @@ void tcp_init_xmit_timers(struct sock *sk)
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
}
-
EXPORT_SYMBOL(tcp_init_xmit_timers);
static void tcp_write_err(struct sock *sk)
@@ -67,18 +66,18 @@ static void tcp_write_err(struct sock *sk)
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
struct tcp_sock *tp = tcp_sk(sk);
- int orphans = percpu_counter_read_positive(&tcp_orphan_count);
+ int shift = 0;
/* If peer does not open window for long time, or did not transmit
* anything for long time, penalize it. */
if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
- orphans <<= 1;
+ shift++;
/* If some dubious ICMP arrived, penalize even more. */
if (sk->sk_err_soft)
- orphans <<= 1;
+ shift++;
- if (tcp_too_many_orphans(sk, orphans)) {
+ if (tcp_too_many_orphans(sk, shift)) {
if (net_ratelimit())
printk(KERN_INFO "Out of socket memory\n");
@@ -136,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
/* This function calculates a "timeout" which is equivalent to the timeout of a
* TCP connection after "boundary" unsuccessful, exponentially backed-off
- * retransmissions with an initial RTO of TCP_RTO_MIN.
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
+ * syn_set flag is set.
*/
static bool retransmits_timed_out(struct sock *sk,
- unsigned int boundary)
+ unsigned int boundary,
+ unsigned int timeout,
+ bool syn_set)
{
- unsigned int timeout, linear_backoff_thresh;
- unsigned int start_ts;
+ unsigned int linear_backoff_thresh, start_ts;
+ unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
if (!inet_csk(sk)->icsk_retransmits)
return false;
@@ -152,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
else
start_ts = tcp_sk(sk)->retrans_stamp;
- linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
-
- if (boundary <= linear_backoff_thresh)
- timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
- else
- timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
- (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ if (likely(timeout == 0)) {
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * rto_base;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+ (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ }
return (tcp_time_stamp - start_ts) >= timeout;
}
@@ -168,14 +171,15 @@ static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int retry_until;
- bool do_reset;
+ bool do_reset, syn_set = 0;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ syn_set = 1;
} else {
- if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -188,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
- !retransmits_timed_out(sk, retry_until);
+ !retransmits_timed_out(sk, retry_until, 0, 0);
if (tcp_out_of_resources(sk, do_reset))
return 1;
}
}
- if (retransmits_timed_out(sk, retry_until)) {
+ if (retransmits_timed_out(sk, retry_until,
+ syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
@@ -362,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0) {
int mib_idx;
- if (icsk->icsk_ca_state == TCP_CA_Disorder) {
- if (tcp_is_sack(tp))
- mib_idx = LINUX_MIB_TCPSACKFAILURES;
- else
- mib_idx = LINUX_MIB_TCPRENOFAILURES;
- } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+ if (icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tcp_is_sack(tp))
mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
else
mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+ } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+ tp->sacked_out) {
+ if (tcp_is_sack(tp))
+ mib_idx = LINUX_MIB_TCPSACKFAILURES;
+ else
+ mib_idx = LINUX_MIB_TCPRENOFAILURES;
} else {
mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
@@ -437,7 +443,7 @@ out_reset_timer:
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
- if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
+ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
__sk_dst_reset(sk);
out:;
@@ -557,7 +563,14 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
- if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
+ /* If the TCP_USER_TIMEOUT option is enabled, use that
+ * to determine when to timeout instead.
+ */
+ if ((icsk->icsk_user_timeout != 0 &&
+ elapsed >= icsk->icsk_user_timeout &&
+ icsk->icsk_probes_out > 0) ||
+ (icsk->icsk_user_timeout == 0 &&
+ icsk->icsk_probes_out >= keepalive_probes(tp))) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
goto out;
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index b612acf76183..38bc0b52d745 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -6,7 +6,7 @@
* "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
* IEEE Journal on Selected Areas in Communication,
* Feb. 2003.
- * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
*/
#include <linux/mm.h>
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..a534dda5456e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
*/
static inline u32 westwood_do_filter(u32 a, u32 b)
{
- return (((7 * a) + b) >> 3);
+ return ((7 * a) + b) >> 3;
}
static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 3b3813cc80b9..ac3b3ee4b07c 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,32 +14,37 @@
#include <net/protocol.h>
#include <net/xfrm.h>
-static struct xfrm_tunnel *tunnel4_handlers;
-static struct xfrm_tunnel *tunnel64_handlers;
+static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
static DEFINE_MUTEX(tunnel4_mutex);
-static inline struct xfrm_tunnel **fam_handlers(unsigned short family)
+static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
{
return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
}
int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
{
- struct xfrm_tunnel **pprev;
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
+
int ret = -EEXIST;
int priority = handler->priority;
mutex_lock(&tunnel4_mutex);
- for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) {
- if ((*pprev)->priority > priority)
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t->priority > priority)
break;
- if ((*pprev)->priority == priority)
+ if (t->priority == priority)
goto err;
}
handler->next = *pprev;
- *pprev = handler;
+ rcu_assign_pointer(*pprev, handler);
ret = 0;
@@ -48,18 +53,21 @@ err:
return ret;
}
-
EXPORT_SYMBOL(xfrm4_tunnel_register);
int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
{
- struct xfrm_tunnel **pprev;
+ struct xfrm_tunnel __rcu **pprev;
+ struct xfrm_tunnel *t;
int ret = -ENOENT;
mutex_lock(&tunnel4_mutex);
- for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) {
- if (*pprev == handler) {
+ for (pprev = fam_handlers(family);
+ (t = rcu_dereference_protected(*pprev,
+ lockdep_is_held(&tunnel4_mutex))) != NULL;
+ pprev = &t->next) {
+ if (t == handler) {
*pprev = handler->next;
ret = 0;
break;
@@ -72,9 +80,13 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
return ret;
}
-
EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+#define for_each_tunnel_rcu(head, handler) \
+ for (handler = rcu_dereference(head); \
+ handler != NULL; \
+ handler = rcu_dereference(handler->next)) \
+
static int tunnel4_rcv(struct sk_buff *skb)
{
struct xfrm_tunnel *handler;
@@ -82,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto drop;
- for (handler = tunnel4_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->handler(skb))
return 0;
@@ -101,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- for (handler = tunnel64_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->handler(skb))
return 0;
@@ -117,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
- for (handler = tunnel4_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
break;
}
@@ -127,7 +139,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
- for (handler = tunnel64_handlers; handler; handler = handler->next)
+ for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->err_handler(skb, info))
break;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eec4ff456e33..28cb2d733a3c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
return -EOPNOTSUPP;
ipc.opt = NULL;
- ipc.shtx.flags = 0;
+ ipc.tx_flags = 0;
if (up->pending) {
/*
@@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
- err = sock_tx_timestamp(msg, sk, &ipc.shtx);
+ err = sock_tx_timestamp(sk, &ipc.tx_flags);
if (err)
return err;
if (msg->msg_controllen) {
@@ -914,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
!sock_flag(sk, SOCK_BROADCAST))
goto out;
if (connected)
- sk_dst_set(sk, dst_clone(&rt->u.dst));
+ sk_dst_set(sk, dst_clone(&rt->dst));
}
if (msg->msg_flags&MSG_CONFIRM)
@@ -978,7 +978,7 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->u.dst);
+ dst_confirm(&rt->dst);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk)
}
EXPORT_SYMBOL(udp_lib_unhash);
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+ if (sk_hashed(sk)) {
+ struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ nhslot2 = udp_hashslot2(udptable, newhash);
+ udp_sk(sk)->udp_portaddr_hash = newhash;
+ if (hslot2 != nhslot2) {
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ /* we must lock primary chain too */
+ spin_lock_bh(&hslot->lock);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &nhslot2->head);
+ nhslot2->count++;
+ spin_unlock(&nhslot2->lock);
+
+ spin_unlock_bh(&hslot->lock);
+ }
+ }
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+static void udp_v4_rehash(struct sock *sk)
+{
+ u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+ inet_sk(sk)->inet_rcv_saddr,
+ inet_sk(sk)->inet_num);
+ udp_lib_rehash(sk, new_hash);
+}
+
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -1370,7 +1413,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
- if (sk->sk_filter) {
+ if (rcu_dereference_raw(sk->sk_filter)) {
if (udp_lib_checksum_complete(skb))
goto drop;
}
@@ -1843,6 +1886,7 @@ struct proto udp_prot = {
.backlog_rcv = __udp_queue_rcv_skb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
+ .rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 6610bf76369f..ab76aa928fa9 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -58,6 +58,7 @@ struct proto udplite_prot = {
.compat_getsockopt = compat_udp_getsockopt,
#endif
};
+EXPORT_SYMBOL(udplite_prot);
static struct inet_protosw udplite4_protosw = {
.type = SOCK_DGRAM,
@@ -127,5 +128,3 @@ out_unregister_proto:
out_register_err:
printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
}
-
-EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index ad8fbb871aa0..06814b6216dc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -163,5 +163,4 @@ int xfrm4_rcv(struct sk_buff *skb)
{
return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
}
-
EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 23883a48ebfb..4464f3bff6a7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
fl.fl4_src = saddr->a4;
err = __ip_route_output_key(net, &rt, &fl);
- dst = &rt->u.dst;
+ dst = &rt->dst;
if (err)
dst = ERR_PTR(err);
return dst;
@@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net,
static int xfrm4_get_tos(struct flowi *fl)
{
- return fl->fl4_tos;
+ return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
}
static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
xfrm4_policy_afinfo.garbage_collect(net);
- return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+ return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
}
static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
.gc_thresh = 1024,
- .entries = ATOMIC_INIT(0),
};
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
* and start cleaning when were 1/2 full
*/
xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+ dst_entries_init(&xfrm4_dst_ops);
xfrm4_state_init();
xfrm4_policy_init();
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a03..47947624eccc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
}
static void
-__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
- struct xfrm_tmpl *tmpl,
- xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+{
+ sel->daddr.a4 = fl->fl4_dst;
+ sel->saddr.a4 = fl->fl4_src;
+ sel->dport = xfrm_flowi_dport(fl);
+ sel->dport_mask = htons(0xffff);
+ sel->sport = xfrm_flowi_sport(fl);
+ sel->sport_mask = htons(0xffff);
+ sel->family = AF_INET;
+ sel->prefixlen_d = 32;
+ sel->prefixlen_s = 32;
+ sel->proto = fl->proto;
+ sel->ifindex = fl->oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+ xfrm_address_t *daddr, xfrm_address_t *saddr)
{
- x->sel.daddr.a4 = fl->fl4_dst;
- x->sel.saddr.a4 = fl->fl4_src;
- x->sel.dport = xfrm_flowi_dport(fl);
- x->sel.dport_mask = htons(0xffff);
- x->sel.sport = xfrm_flowi_sport(fl);
- x->sel.sport_mask = htons(0xffff);
- x->sel.family = AF_INET;
- x->sel.prefixlen_d = 32;
- x->sel.prefixlen_s = 32;
- x->sel.proto = fl->proto;
- x->sel.ifindex = fl->oif;
x->id = tmpl->id;
if (x->id.daddr.a4 == 0)
x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.owner = THIS_MODULE,
.init_flags = xfrm4_init_flags,
.init_tempsel = __xfrm4_init_tempsel,
+ .init_temprop = xfrm4_init_temprop,
.output = xfrm4_output,
.extract_input = xfrm4_extract_input,
.extract_output = xfrm4_extract_output,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
return -ENOENT;
}
-static struct xfrm_tunnel xfrm_tunnel_handler = {
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
.priority = 2,
};
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static struct xfrm_tunnel xfrm64_tunnel_handler = {
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
.priority = 2,