diff options
author | Michal Marek <mmarek@suse.cz> | 2010-12-14 22:01:55 +0100 |
---|---|---|
committer | Michal Marek <mmarek@suse.cz> | 2010-12-14 22:01:55 +0100 |
commit | 8990c1bc4be46473ad19bf2fa612ca57286f3df4 (patch) | |
tree | 3cea60576903a1d26c67e6ec62891b524d390e95 /net/ipv4 | |
parent | 2979076fbf17a0947d6eba367b0cac19c907c160 (diff) | |
parent | c8ddb2713c624f432fa5fe3c7ecffcdda46ea0d4 (diff) | |
download | linux-8990c1bc4be46473ad19bf2fa612ca57286f3df4.tar.gz linux-8990c1bc4be46473ad19bf2fa612ca57286f3df4.tar.bz2 linux-8990c1bc4be46473ad19bf2fa612ca57286f3df4.zip |
Merge commit 'v2.6.37-rc1' into kbuild/kbuild
Diffstat (limited to 'net/ipv4')
85 files changed, 3399 insertions, 2519 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7c3a7d191249..9e95d7fb6d5a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER rp_filter on use: echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter - and + or echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter Note that some distributions enable it in startup scripts. @@ -84,7 +84,7 @@ config IP_FIB_TRIE An experimental study of compression methods for dynamic tries Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + <http://www.csc.kth.se/~snilsson/software/dyntrie2/> endchoice @@ -215,8 +215,15 @@ config NET_IPIP be inserted in and removed from the running kernel whenever you want). Most people won't need this and can say N. +config NET_IPGRE_DEMUX + tristate "IP: GRE demultiplexer" + help + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. + config NET_IPGRE tristate "IP: GRE tunnels over IP" + depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -412,7 +419,7 @@ config INET_XFRM_MODE_BEET If unsure, say Y. config INET_LRO - bool "Large Receive Offload (ipv4/tcp)" + tristate "Large Receive Offload (ipv4/tcp)" default y ---help--- Support for Large Receive Offload (ipv4/tcp). @@ -555,7 +562,7 @@ config TCP_CONG_VENO distinguishing to circumvent the difficult judgment of the packet loss type. TCP Veno cuts down less congestion window in response to random loss packets. - See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> config TCP_CONG_YEAH tristate "YeAH TCP" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87ce43aa..4978d22f9a75 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 551ce564b035..f581f77d1097 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret); /* * inet_ehash_secret must be set exactly once - * Instead of using a dedicated spinlock, we (ab)use inetsw_lock */ void build_ehash_secret(void) { u32 rnd; + do { get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - spin_lock_bh(&inetsw_lock); - if (!inet_ehash_secret) - inet_ehash_secret = rnd; - spin_unlock_bh(&inetsw_lock); + + cmpxchg(&inet_ehash_secret, 0, rnd); } EXPORT_SYMBOL(build_ehash_secret); @@ -355,6 +353,8 @@ lookup_protocol: inet = inet_sk(sk); inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + inet->nodefrag = 0; + if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) @@ -725,28 +725,31 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, sock_rps_record_flow(sk); /* We may need to bind the socket. */ - if (!inet_sk(sk)->inet_num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && + inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->sendmsg(iocb, sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); -static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags) +ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) { struct sock *sk = sock->sk; sock_rps_record_flow(sk); /* We may need to bind the socket. */ - if (!inet_sk(sk)->inet_num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind && + inet_autobind(sk)) return -EAGAIN; if (sk->sk_prot->sendpage) return sk->sk_prot->sendpage(sk, page, offset, size, flags); return sock_no_sendpage(sock, page, offset, size, flags); } +EXPORT_SYMBOL(inet_sendpage); int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) @@ -892,10 +895,10 @@ const struct proto_ops inet_stream_ops = { .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, - .sendmsg = tcp_sendmsg, + .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, - .sendpage = tcp_sendpage, + .sendpage = inet_sendpage, .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, @@ -1100,7 +1103,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (err) return err; - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); new_saddr = rt->rt_src; @@ -1166,7 +1169,7 @@ int inet_sk_rebuild_header(struct sock *sk) err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); } if (!err) - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); else { /* Routing failed... */ sk->sk_route_caps = 0; @@ -1425,13 +1428,49 @@ unsigned long snmp_fold_field(void __percpu *mib[], int offt) } EXPORT_SYMBOL_GPL(snmp_fold_field); -int snmp_mib_init(void __percpu *ptr[2], size_t mibsize) +#if BITS_PER_LONG==32 + +u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset) +{ + u64 res = 0; + int cpu; + + for_each_possible_cpu(cpu) { + void *bhptr, *userptr; + struct u64_stats_sync *syncp; + u64 v_bh, v_user; + unsigned int start; + + /* first mib used by softirq context, we must use _bh() accessors */ + bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu); + syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); + do { + start = u64_stats_fetch_begin_bh(syncp); + v_bh = *(((u64 *) bhptr) + offt); + } while (u64_stats_fetch_retry_bh(syncp, start)); + + /* second mib used in USER context */ + userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu); + syncp = (struct u64_stats_sync *)(userptr + syncp_offset); + do { + start = u64_stats_fetch_begin(syncp); + v_user = *(((u64 *) userptr) + offt); + } while (u64_stats_fetch_retry(syncp, start)); + + res += v_bh + v_user; + } + return res; +} +EXPORT_SYMBOL_GPL(snmp_fold_field64); +#endif + +int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align) { BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long)); + ptr[0] = __alloc_percpu(mibsize, align); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long)); + ptr[1] = __alloc_percpu(mibsize, align); if (!ptr[1]) goto err1; return 0; @@ -1488,25 +1527,32 @@ static const struct net_protocol icmp_protocol = { static __net_init int ipv4_mib_init_net(struct net *net) { if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, - sizeof(struct tcp_mib)) < 0) + sizeof(struct tcp_mib), + __alignof__(struct tcp_mib)) < 0) goto err_tcp_mib; if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, - sizeof(struct ipstats_mib)) < 0) + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) goto err_ip_mib; if (snmp_mib_init((void __percpu **)net->mib.net_statistics, - sizeof(struct linux_mib)) < 0) + sizeof(struct linux_mib), + __alignof__(struct linux_mib)) < 0) goto err_net_mib; if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, - sizeof(struct udp_mib)) < 0) + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) goto err_udp_mib; if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, - sizeof(struct udp_mib)) < 0) + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) goto err_udplite_mib; if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, - sizeof(struct icmp_mib)) < 0) + sizeof(struct icmp_mib), + __alignof__(struct icmp_mib)) < 0) goto err_icmp_mib; if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, - sizeof(struct icmpmsg_mib)) < 0) + sizeof(struct icmpmsg_mib), + __alignof__(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; tcp_mib_init(net); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f094b75810db..d8e540c5b071 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -55,7 +55,7 @@ * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. - * Alan Cox : Took the AP1000 nasty FDDI hack and + * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... @@ -116,17 +116,18 @@ #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) #include <net/atmclip.h> struct neigh_table *clip_tbl_hook; +EXPORT_SYMBOL(clip_tbl_hook); #endif #include <asm/system.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. */ -static u32 arp_hash(const void *pkey, const struct net_device *dev); +static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -160,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = { .queue_xmit = dev_queue_xmit, }; -const struct neigh_ops arp_broken_ops = { +static const struct neigh_ops arp_broken_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -171,33 +172,34 @@ const struct neigh_ops arp_broken_ops = { }; struct neigh_table arp_tbl = { - .family = AF_INET, - .entry_size = sizeof(struct neighbour) + 4, - .key_len = 4, - .hash = arp_hash, - .constructor = arp_constructor, - .proxy_redo = parp_redo, - .id = "arp_cache", - .parms = { - .tbl = &arp_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, - .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len = 3, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, - .locktime = 1 * HZ, + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, + .hash = arp_hash, + .constructor = arp_constructor, + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { + .tbl = &arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, }, - .gc_interval = 30 * HZ, - .gc_thresh1 = 128, - .gc_thresh2 = 512, - .gc_thresh3 = 1024, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, }; +EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { @@ -223,14 +225,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) } -static u32 arp_hash(const void *pkey, const struct net_device *dev) +static u32 arp_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) { - return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); + return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd); } static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32*)neigh->primary_key; + __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; @@ -293,16 +297,19 @@ static int arp_constructor(struct neighbour *neigh) neigh->ops = &arp_broken_ops; neigh->output = neigh->ops->output; return 0; +#else + break; #endif - ;} + } #endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); - } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); - } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + } else if (neigh->type == RTN_BROADCAST || + (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } @@ -312,7 +319,7 @@ static int arp_constructor(struct neighbour *neigh) else neigh->ops = &arp_generic_ops; - if (neigh->nud_state&NUD_VALID) + if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; @@ -331,17 +338,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 saddr = 0; u8 *dst_ha = NULL; struct net_device *dev = neigh->dev; - __be32 target = *(__be32*)neigh->primary_key; + __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev; - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { + rcu_read_unlock(); return; - + } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(dev_net(dev), + ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ @@ -358,22 +369,26 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } + rcu_read_unlock(); - if (in_dev) - in_dev_put(in_dev); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); - if ((probes -= neigh->parms->ucast_probes) < 0) { - if (!(neigh->nud_state&NUD_VALID)) - printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + probes -= neigh->parms->ucast_probes; + if (probes < 0) { + if (!(neigh->nud_state & NUD_VALID)) + printk(KERN_DEBUG + "trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; read_lock_bh(&neigh->lock); - } else if ((probes -= neigh->parms->app_probes) < 0) { + } else { + probes -= neigh->parms->app_probes; + if (probes < 0) { #ifdef CONFIG_ARPD - neigh_app_ns(neigh); + neigh_app_ns(neigh); #endif - return; + return; + } } arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, @@ -427,7 +442,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if (ip_route_output_key(net, &rt, &fl) < 0) return 1; - if (rt->u.dst.dev != dev) { + if (rt->dst.dev != dev) { NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); flag = 1; } @@ -446,7 +461,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) * is allowed to use this function, it is scheduled to be removed. --ANK */ -static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) +static int arp_set_predefined(int addr_hint, unsigned char *haddr, + __be32 paddr, struct net_device *dev) { switch (addr_hint) { case RTN_LOCAL: @@ -478,17 +494,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = skb_rtable(skb)->rt_gateway; - if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, + paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); if (n) { n->used = jiffies; - if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { - read_lock_bh(&n->lock); - memcpy(haddr, n->ha, dev->addr_len); - read_unlock_bh(&n->lock); + if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) { + neigh_ha_snapshot(haddr, n, dev); neigh_release(n); return 0; } @@ -497,6 +512,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) kfree_skb(skb); return 1; } +EXPORT_SYMBOL(arp_find); /* END OF OBSOLETE FUNCTIONS */ @@ -509,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst) return -EINVAL; if (n == NULL) { __be32 nexthop = ((struct rtable *)dst)->rt_gateway; - if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) nexthop = 0; n = __neigh_lookup_errno( #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) - dev->type == ARPHRD_ATM ? clip_tbl_hook : + dev->type == ARPHRD_ATM ? + clip_tbl_hook : #endif - &arp_tbl, &nexthop, dev); + &arp_tbl, &nexthop, dev); if (IS_ERR(n)) return PTR_ERR(n); dst->neighbour = n; @@ -532,24 +549,24 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct in_device *out_dev; int imi, omi = -1; - if (rt->u.dst.dev == dev) + if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; - - if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) + imi = IN_DEV_MEDIUM_ID(in_dev); + if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ - if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) { + out_dev = __in_dev_get_rcu(rt->dst.dev); + if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); - in_dev_put(out_dev); - } - return (omi != imi && omi != -1); + + return omi != imi && omi != -1; } /* @@ -576,7 +593,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ - if (rt->u.dst.dev != dev) + if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ @@ -679,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp->ar_pln = 4; arp->ar_op = htons(type); - arp_ptr=(unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; @@ -698,6 +715,7 @@ out: kfree_skb(skb); return NULL; } +EXPORT_SYMBOL(arp_create); /* * Send an arp packet. @@ -707,6 +725,7 @@ void arp_xmit(struct sk_buff *skb) /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); } +EXPORT_SYMBOL(arp_xmit); /* * Create and send an arp packet. @@ -727,12 +746,12 @@ void arp_send(int type, int ptype, __be32 dest_ip, skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); - if (skb == NULL) { + if (skb == NULL) return; - } arp_xmit(skb); } +EXPORT_SYMBOL(arp_send); /* * Process an arp request. @@ -741,7 +760,7 @@ void arp_send(int type, int ptype, __be32 dest_ip, static int arp_process(struct sk_buff *skb) { struct net_device *dev = skb->dev; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; @@ -806,7 +825,7 @@ static int arp_process(struct sk_buff *skb) /* * Extract fields */ - arp_ptr= (unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); @@ -860,16 +879,17 @@ static int arp_process(struct sk_buff *skb) addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { - int dont_send = 0; + int dont_send; - if (!dont_send) - dont_send |= arp_ignore(in_dev,sip,tip); + dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) - dont_send |= arp_filter(sip,tip,dev); + dont_send |= arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); neigh_release(n); } } @@ -878,8 +898,7 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) - { + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -887,10 +906,12 @@ static int arp_process(struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); } else { - pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); - in_dev_put(in_dev); + pneigh_enqueue(&arp_tbl, + in_dev->arp_parms, skb); return 0; } goto out; @@ -931,13 +952,12 @@ static int arp_process(struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; - neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); + neigh_update(n, sha, state, + override ? NEIGH_UPDATE_F_OVERRIDE : 0); neigh_release(n); } out: - if (in_dev) - in_dev_put(in_dev); consume_skb(skb); return 0; } @@ -969,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, arp->ar_pln != 4) goto freeskb; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) goto out_of_mem; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); @@ -1013,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, - r->arp_ha.sa_data); + r->arp_ha.sa_data); if (!dev) return -ENODEV; } @@ -1027,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, } static int arp_req_set(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { __be32 ip; struct neighbour *neigh; @@ -1040,12 +1061,13 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return -EINVAL; @@ -1077,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r, unsigned state = NUD_STALE; if (r->arp_flags & ATF_PERM) state = NUD_PERMANENT; - err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, - NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); } @@ -1088,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r, static unsigned arp_state_to_flags(struct neighbour *neigh) { - unsigned flags = 0; if (neigh->nud_state&NUD_PERMANENT) - flags = ATF_PERM|ATF_COM; + return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) - flags = ATF_COM; - return flags; + return ATF_COM; + else + return 0; } /* @@ -1136,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, } static int arp_req_delete(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { int err; __be32 ip; @@ -1147,12 +1169,13 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return -EINVAL; @@ -1160,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, err = -ENXIO; neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { - if (neigh->nud_state&~NUD_NOARP) + if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN); @@ -1180,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) struct net_device *dev = NULL; switch (cmd) { - case SIOCDARP: - case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - case SIOCGARP: - err = copy_from_user(&r, arg, sizeof(struct arpreq)); - if (err) - return -EFAULT; - break; - default: - return -EINVAL; + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && - (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) + (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; if (!(r.arp_flags & ATF_NETMASK)) ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = @@ -1205,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) + dev = __dev_get_by_name(net, r.arp_dev); + if (dev == NULL) goto out; /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ @@ -1237,7 +1261,8 @@ out: return err; } -static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +static int arp_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) { struct net_device *dev = ptr; @@ -1305,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf) for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; - if (c != ' ') *s++ = c; + if (c != ' ') + *s++ = c; } *s++ = '-'; - - if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + n = (a->ax25_call[6] >> 1) & 0x0F; + if (n > 9) { *s++ = '1'; n -= 10; } @@ -1319,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf) *s++ = '\0'; if (*buf == '\0' || *buf == '-') - return "*"; + return "*"; return buf; - } #endif /* CONFIG_AX25 */ @@ -1402,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos) /* ------------------------------------------------------------------------ */ static const struct seq_operations arp_seq_ops = { - .start = arp_seq_start, - .next = neigh_seq_next, - .stop = neigh_seq_stop, - .show = arp_seq_show, + .start = arp_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = arp_seq_show, }; static int arp_seq_open(struct inode *inode, struct file *file) @@ -1453,14 +1478,3 @@ static int __init arp_proc_init(void) } #endif /* CONFIG_PROC_FS */ - -EXPORT_SYMBOL(arp_broken_ops); -EXPORT_SYMBOL(arp_find); -EXPORT_SYMBOL(arp_create); -EXPORT_SYMBOL(arp_xmit); -EXPORT_SYMBOL(arp_send); -EXPORT_SYMBOL(arp_tbl); - -#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) -EXPORT_SYMBOL(clip_tbl_hook); -#endif diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 3a92a76ae41d..094e150c6260 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -9,7 +9,7 @@ * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: - * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt + * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: * http://www.itl.nist.gov/fipspubs/fip188.htm * diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index fb2465811b48..174be6caa5c8 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -62,16 +62,17 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } if (!inet->inet_saddr) inet->inet_saddr = rt->rt_src; /* Update source address */ - if (!inet->inet_rcv_saddr) + if (!inet->inet_rcv_saddr) { inet->inet_rcv_saddr = rt->rt_src; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } inet->inet_daddr = rt->rt_dst; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; inet->inet_id = jiffies; - sk_dst_set(sk, &rt->u.dst); - return(0); + sk_dst_set(sk, &rt->dst); + return 0; } - EXPORT_SYMBOL(ip4_datagram_connect); - diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 382bc768ed56..dc94b0316b78 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev) inet_free_ifa(ifa); } - dev->ip_ptr = NULL; + rcu_assign_pointer(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); @@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) return inet_insert_ifa(ifa); } +/* Caller must hold RCU or RTNL : + * We dont take a reference on found in_device + */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; @@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex) rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) - in_dev = in_dev_get(dev); + in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } @@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg goto errout; } - __in_dev_put(in_dev); - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && @@ -1059,7 +1060,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_REGISTER: printk(KERN_DEBUG "inetdev_event: bug\n"); - dev->ip_ptr = NULL; + rcu_assign_pointer(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) @@ -1081,6 +1082,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } ip_mc_up(in_dev); /* fall through */ + case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEADDR: /* Send gratuitous ARP to notify of link change */ if (IN_DEV_ARP_NOTIFY(in_dev)) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4f0ed458c883..eb6f69a8f27a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -147,34 +147,43 @@ static void fib_flush(struct net *net) rt_cache_flush(net, -1); } -/* - * Find the first device with a given source address. +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU, or RTNL */ - -struct net_device * ip_dev_find(struct net *net, __be32 addr) +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; - struct fib_result res; + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = addr + } + }, + .flags = FLOWI_FLAG_MATCH_ANY_IIF + }; + struct fib_result res = { 0 }; struct net_device *dev = NULL; - struct fib_table *local_table; - -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || fib_table_lookup(local_table, &fl, &res)) + rcu_read_lock(); + if (fib_lookup(net, &fl, &res)) { + rcu_read_unlock(); return NULL; + } if (res.type != RTN_LOCAL) goto out; dev = FIB_RES_DEV(res); - if (dev) + if (dev && devref) dev_hold(dev); out: - fib_res_put(&res); + rcu_read_unlock(); return dev; } +EXPORT_SYMBOL(__ip_dev_find); /* * Find address type as if only "dev" was present in the system. If @@ -201,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net, local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; - if (!fib_table_lookup(local_table, &fl, &res)) { + rcu_read_lock(); + if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; - fib_res_put(&res); } + rcu_read_unlock(); } return ret; } @@ -214,40 +224,46 @@ unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr); } +EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - return __inet_dev_addr_type(net, dev, addr); + return __inet_dev_addr_type(net, dev, addr); } +EXPORT_SYMBOL(inet_dev_addr_type); /* Given (packet source, input interface) and optional (dst, oif, tos): - - (main) check, that source is valid i.e. not broadcast or our local - address. - - figure out what "logical" interface this packet arrived - and calculate "specific destination" address. - - check, that packet arrived from expected physical interface. + * - (main) check, that source is valid i.e. not broadcast or our local + * address. + * - figure out what "logical" interface this packet arrived + * and calculate "specific destination" address. + * - check, that packet arrived from expected physical interface. + * called with rcu_read_lock() */ - int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, __be32 *spec_dst, u32 *itag, u32 mark) { struct in_device *in_dev; - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = src, - .saddr = dst, - .tos = tos } }, - .mark = mark, - .iif = oif }; - + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = src, + .saddr = dst, + .tos = tos + } + }, + .mark = mark, + .iif = oif + }; struct fib_result res; int no_addr, rpf, accept_local; + bool dev_match; int ret; struct net *net; no_addr = rpf = accept_local = 0; - rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; @@ -256,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (mark && !IN_DEV_SRC_VMARK(in_dev)) fl.mark = 0; } - rcu_read_unlock(); if (in_dev == NULL) goto e_inval; @@ -266,25 +281,33 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL || !accept_local) - goto e_inval_res; + goto e_inval; } *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); + dev_match = false; + #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) + for (ret = 0; ret < res.fi->fib_nhs; ret++) { + struct fib_nh *nh = &res.fi->fib_nh[ret]; + + if (nh->nh_dev == dev) { + dev_match = true; + break; + } + } #else if (FIB_RES_DEV(res) == dev) + dev_match = true; #endif - { + if (dev_match) { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; - fib_res_put(&res); return ret; } - fib_res_put(&res); if (no_addr) goto last_resort; if (rpf == 1) - goto e_inval; + goto e_rpf; fl.oif = dev->ifindex; ret = 0; @@ -293,21 +316,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, *spec_dst = FIB_RES_PREFSRC(res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } - fib_res_put(&res); } return ret; last_resort: if (rpf) - goto e_inval; + goto e_rpf; *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; -e_inval_res: - fib_res_put(&res); e_inval: return -EINVAL; +e_rpf: + return -EXDEV; } static inline __be32 sk_extract_addr(struct sockaddr *addr) @@ -456,9 +478,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, } /* - * Handle IP routing ioctl calls. These are used to manipulate the routing tables + * Handle IP routing ioctl calls. + * These are used to manipulate the routing tables */ - int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct fib_config cfg; @@ -502,7 +524,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) return -EINVAL; } -const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { +const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, @@ -516,7 +538,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_config *cfg) + struct nlmsghdr *nlh, struct fib_config *cfg) { struct nlattr *attr; int err, remaining; @@ -671,12 +693,11 @@ out: } /* Prepare and feed intra-kernel routing request. - Really, it should be netlink message, but :-( netlink - can be not configured, so that we feed it directly - to fib engine. It is legal, because all events occur - only when netlink is already locked. + * Really, it should be netlink message, but :-( netlink + * can be not configured, so that we feed it directly + * to fib engine. It is legal, because all events occur + * only when netlink is already locked. */ - static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); @@ -722,9 +743,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; - __be32 prefix = ifa->ifa_address&mask; + __be32 prefix = ifa->ifa_address & mask; - if (ifa->ifa_flags&IFA_F_SECONDARY) { + if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (prim == NULL) { printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); @@ -734,22 +755,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); - if (!(dev->flags&IFF_UP)) + if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : - RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); + fib_magic(RTM_NEWROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, prim); /* Add network specific broadcasts, when it takes a sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, + 32, prim); } } } @@ -760,17 +783,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa; - __be32 brd = ifa->ifa_address|~ifa->ifa_mask; - __be32 any = ifa->ifa_address&ifa->ifa_mask; + __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; + __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; - if (!(ifa->ifa_flags&IFA_F_SECONDARY)) - fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : - RTN_UNICAST, any, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); else { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { @@ -780,9 +804,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) } /* Deletion is more complicated than add. - We should take care of not to delete too much :-) - - Scan address list to be sure that addresses are really gone. + * We should take care of not to delete too much :-) + * + * Scan address list to be sure that addresses are really gone. */ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { @@ -796,23 +820,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) ok |= BRD0_OK; } - if (!(ok&BRD_OK)) + if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!(ok&BRD1_OK)) + if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); - if (!(ok&BRD0_OK)) + if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); - if (!(ok&LOCAL_OK)) { + if (!(ok & LOCAL_OK)) { fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. - We must flush stray FIB entries. - - First of all, we scan fib_info list searching - for stray nexthop entries, then ignite fib_flush. - */ + * We must flush stray FIB entries. + * + * First of all, we scan fib_info list searching + * for stray nexthop entries, then ignite fib_flush. + */ if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) fib_flush(dev_net(dev)); } @@ -823,14 +847,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #undef BRD1_OK } -static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) +static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) { struct fib_result res; - struct flowi fl = { .mark = frn->fl_mark, - .nl_u = { .ip4_u = { .daddr = frn->fl_addr, - .tos = frn->fl_tos, - .scope = frn->fl_scope } } }; + struct flowi fl = { + .mark = frn->fl_mark, + .nl_u = { + .ip4_u = { + .daddr = frn->fl_addr, + .tos = frn->fl_tos, + .scope = frn->fl_scope + } + } + }; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; @@ -841,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) local_bh_disable(); frn->tb_id = tb->tb_id; - frn->err = fib_table_lookup(tb, &fl, &res); + rcu_read_lock(); + frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; - fib_res_put(&res); } + rcu_read_unlock(); local_bh_enable(); } } @@ -878,8 +909,8 @@ static void nl_fib_input(struct sk_buff *skb) nl_fib_lookup(frn, tb); - pid = NETLINK_CB(skb).pid; /* pid of sending process */ - NETLINK_CB(skb).pid = 0; /* from kernel */ + pid = NETLINK_CB(skb).pid; /* pid of sending process */ + NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); } @@ -926,7 +957,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_del_ifaddr(ifa); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. - Disable IP. + * Disable IP. */ fib_disable_ip(dev, 1, 0); } else { @@ -985,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = { static int __net_init ip_fib_net_init(struct net *net) { int err; - unsigned int i; + size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; - net->ipv4.fib_table_hash = kzalloc( - sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL); + /* Avoid false sharing : Use at least a full cache line */ + size = max_t(size_t, size, L1_CACHE_BYTES); + + net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (net->ipv4.fib_table_hash == NULL) return -ENOMEM; - for (i = 0; i < FIB_TABLE_HASHSZ; i++) - INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]); - err = fib4_rules_init(net); if (err < 0) goto fail; @@ -1022,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { hlist_del(node); fib_table_flush(tb); - kfree(tb); + fib_free_table(tb); } } kfree(net->ipv4.fib_table_hash); @@ -1075,7 +1105,3 @@ void __init ip_fib_init(void) fib_hash_init(); } - -EXPORT_SYMBOL(inet_addr_type); -EXPORT_SYMBOL(inet_dev_addr_type); -EXPORT_SYMBOL(ip_dev_find); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 4ed7e0dea1bc..b3acb0417b21 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -54,36 +54,37 @@ struct fib_node { struct fib_alias fn_embedded_alias; }; -struct fn_zone { - struct fn_zone *fz_next; /* Next not empty zone */ - struct hlist_head *fz_hash; /* Hash table pointer */ - int fz_nent; /* Number of entries */ +#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) - int fz_divisor; /* Hash divisor */ +struct fn_zone { + struct fn_zone __rcu *fz_next; /* Next not empty zone */ + struct hlist_head __rcu *fz_hash; /* Hash table pointer */ + seqlock_t fz_lock; u32 fz_hashmask; /* (fz_divisor - 1) */ -#define FZ_HASHMASK(fz) ((fz)->fz_hashmask) - int fz_order; /* Zone order */ - __be32 fz_mask; + u8 fz_order; /* Zone order (0..32) */ + u8 fz_revorder; /* 32 - fz_order */ + __be32 fz_mask; /* inet_make_mask(order) */ #define FZ_MASK(fz) ((fz)->fz_mask) -}; -/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask - * can be cheaper than memory lookup, so that FZ_* macros are used. - */ + struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE]; + + int fz_nent; /* Number of entries */ + int fz_divisor; /* Hash size (mask+1) */ +}; struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone *fn_zone_list; + struct fn_zone *fn_zones[33]; + struct fn_zone __rcu *fn_zone_list; }; static inline u32 fn_hash(__be32 key, struct fn_zone *fz) { - u32 h = ntohl(key)>>(32 - fz->fz_order); + u32 h = ntohl(key) >> fz->fz_revorder; h ^= (h>>20); h ^= (h>>10); h ^= (h>>5); - h &= FZ_HASHMASK(fz); + h &= fz->fz_hashmask; return h; } @@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) return dst & FZ_MASK(fz); } -static DEFINE_RWLOCK(fib_hash_lock); static unsigned int fib_hash_genid; #define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) @@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor) { unsigned long size = divisor * sizeof(struct hlist_head); - if (size <= PAGE_SIZE) { + if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); - } else { - return (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); - } + + return (struct hlist_head *) + __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); } /* The fib hash lock must be held when this is called. */ @@ -123,10 +122,11 @@ static inline void fn_rebuild_zone(struct fn_zone *fz, hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { struct hlist_head *new_head; - hlist_del(&f->fn_hash); + hlist_del_rcu(&f->fn_hash); - new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; - hlist_add_head(&f->fn_hash, new_head); + new_head = rcu_dereference_protected(fz->fz_hash, 1) + + fn_hash(f->fn_key, fz); + hlist_add_head_rcu(&f->fn_hash, new_head); } } } @@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz) int old_divisor, new_divisor; u32 new_hashmask; - old_divisor = fz->fz_divisor; + new_divisor = old_divisor = fz->fz_divisor; switch (old_divisor) { - case 16: - new_divisor = 256; + case EMBEDDED_HASH_SIZE: + new_divisor *= EMBEDDED_HASH_SIZE; break; - case 256: - new_divisor = 1024; + case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: + new_divisor *= (EMBEDDED_HASH_SIZE/2); break; default: if ((old_divisor << 1) > FZ_MAX_DIVISOR) { @@ -175,31 +175,55 @@ static void fn_rehash_zone(struct fn_zone *fz) ht = fz_hash_alloc(new_divisor); if (ht) { - write_lock_bh(&fib_hash_lock); - old_ht = fz->fz_hash; - fz->fz_hash = ht; + struct fn_zone nfz; + + memcpy(&nfz, fz, sizeof(nfz)); + + write_seqlock_bh(&fz->fz_lock); + old_ht = rcu_dereference_protected(fz->fz_hash, 1); + RCU_INIT_POINTER(nfz.fz_hash, ht); + nfz.fz_hashmask = new_hashmask; + nfz.fz_divisor = new_divisor; + fn_rebuild_zone(&nfz, old_ht, old_divisor); + fib_hash_genid++; + rcu_assign_pointer(fz->fz_hash, ht); fz->fz_hashmask = new_hashmask; fz->fz_divisor = new_divisor; - fn_rebuild_zone(fz, old_ht, old_divisor); - fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); + write_sequnlock_bh(&fz->fz_lock); - fz_hash_free(old_ht, old_divisor); + if (old_ht != fz->fz_embedded_hash) { + synchronize_rcu(); + fz_hash_free(old_ht, old_divisor); + } } } -static inline void fn_free_node(struct fib_node * f) +static void fn_free_node_rcu(struct rcu_head *head) { + struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); + kmem_cache_free(fn_hash_kmem, f); } +static inline void fn_free_node(struct fib_node *f) +{ + call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); +} + +static void fn_free_alias_rcu(struct rcu_head *head) +{ + struct fib_alias *fa = container_of(head, struct fib_alias, rcu); + + kmem_cache_free(fn_alias_kmem, fa); +} + static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) { fib_release_info(fa->fa_info); if (fa == &f->fn_embedded_alias) fa->fa_info = NULL; else - kmem_cache_free(fn_alias_kmem, fa); + call_rcu(&fa->rcu, fn_free_alias_rcu); } static struct fn_zone * @@ -210,68 +234,71 @@ fn_new_zone(struct fn_hash *table, int z) if (!fz) return NULL; - if (z) { - fz->fz_divisor = 16; - } else { - fz->fz_divisor = 1; - } - fz->fz_hashmask = (fz->fz_divisor - 1); - fz->fz_hash = fz_hash_alloc(fz->fz_divisor); - if (!fz->fz_hash) { - kfree(fz); - return NULL; - } + seqlock_init(&fz->fz_lock); + fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; + fz->fz_hashmask = fz->fz_divisor - 1; + RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); fz->fz_order = z; + fz->fz_revorder = 32 - z; fz->fz_mask = inet_make_mask(z); /* Find the first not empty zone with more specific mask */ - for (i=z+1; i<=32; i++) + for (i = z + 1; i <= 32; i++) if (table->fn_zones[i]) break; - write_lock_bh(&fib_hash_lock); - if (i>32) { + if (i > 32) { /* No more specific masks, we are the first. */ - fz->fz_next = table->fn_zone_list; - table->fn_zone_list = fz; + rcu_assign_pointer(fz->fz_next, + rtnl_dereference(table->fn_zone_list)); + rcu_assign_pointer(table->fn_zone_list, fz); } else { - fz->fz_next = table->fn_zones[i]->fz_next; - table->fn_zones[i]->fz_next = fz; + rcu_assign_pointer(fz->fz_next, + rtnl_dereference(table->fn_zones[i]->fz_next)); + rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); } table->fn_zones[z] = fz; fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); return fz; } int fib_table_lookup(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res) + const struct flowi *flp, struct fib_result *res, + int fib_flags) { int err; struct fn_zone *fz; struct fn_hash *t = (struct fn_hash *)tb->tb_data; - read_lock(&fib_hash_lock); - for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { + rcu_read_lock(); + for (fz = rcu_dereference(t->fn_zone_list); + fz != NULL; + fz = rcu_dereference(fz->fz_next)) { struct hlist_head *head; struct hlist_node *node; struct fib_node *f; - __be32 k = fz_key(flp->fl4_dst, fz); + __be32 k; + unsigned int seq; - head = &fz->fz_hash[fn_hash(k, fz)]; - hlist_for_each_entry(f, node, head, fn_hash) { - if (f->fn_key != k) - continue; + do { + seq = read_seqbegin(&fz->fz_lock); + k = fz_key(flp->fl4_dst, fz); + + head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); + hlist_for_each_entry_rcu(f, node, head, fn_hash) { + if (f->fn_key != k) + continue; - err = fib_semantic_match(&f->fn_alias, + err = fib_semantic_match(&f->fn_alias, flp, res, - fz->fz_order); - if (err <= 0) - goto out; - } + fz->fz_order, fib_flags); + if (err <= 0) + goto out; + } + } while (read_seqretry(&fz->fz_lock, seq)); } err = 1; out: - read_unlock(&fib_hash_lock); + rcu_read_unlock(); return err; } @@ -285,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb, struct fib_info *last_resort; struct fn_hash *t = (struct fn_hash *)tb->tb_data; struct fn_zone *fz = t->fn_zones[0]; + struct hlist_head *head; if (fz == NULL) return; @@ -293,11 +321,12 @@ void fib_table_select_default(struct fib_table *tb, last_resort = NULL; order = -1; - read_lock(&fib_hash_lock); - hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { + rcu_read_lock(); + head = rcu_dereference(fz->fz_hash); + hlist_for_each_entry_rcu(f, node, head, fn_hash) { struct fib_alias *fa; - list_for_each_entry(fa, &f->fn_alias, fa_list) { + list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { struct fib_info *next_fi = fa->fa_info; if (fa->fa_scope != res->scope || @@ -309,7 +338,8 @@ void fib_table_select_default(struct fib_table *tb, if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; - fa->fa_state |= FA_S_ACCESSED; + + fib_alias_accessed(fa); if (fi == NULL) { if (next_fi != res->fi) @@ -341,25 +371,25 @@ void fib_table_select_default(struct fib_table *tb, fib_result_assign(res, last_resort); tb->tb_default = last_idx; out: - read_unlock(&fib_hash_lock); + rcu_read_unlock(); } /* Insert node F to FZ. */ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) { - struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); - hlist_add_head(&f->fn_hash, head); + hlist_add_head_rcu(&f->fn_hash, head); } /* Return the node in FZ matching KEY. */ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) { - struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); struct hlist_node *node; struct fib_node *f; - hlist_for_each_entry(f, node, head, fn_hash) { + hlist_for_each_entry_rcu(f, node, head, fn_hash) { if (f->fn_key == key) return f; } @@ -367,6 +397,17 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) return NULL; } + +static struct fib_alias *fib_fast_alloc(struct fib_node *f) +{ + struct fib_alias *fa = &f->fn_embedded_alias; + + if (fa->fa_info != NULL) + fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + return fa; +} + +/* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; @@ -451,7 +492,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } if (cfg->fc_nlflags & NLM_F_REPLACE) { - struct fib_info *fi_drop; u8 state; fa = fa_first; @@ -460,21 +500,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) err = 0; goto out; } - write_lock_bh(&fib_hash_lock); - fi_drop = fa->fa_info; - fa->fa_info = fi; - fa->fa_type = cfg->fc_type; - fa->fa_scope = cfg->fc_scope; + err = -ENOBUFS; + new_fa = fib_fast_alloc(f); + if (new_fa == NULL) + goto out; + + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; + new_fa->fa_type = cfg->fc_type; + new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; - fa->fa_state &= ~FA_S_ACCESSED; + new_fa->fa_state = state & ~FA_S_ACCESSED; fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); + list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - fib_release_info(fi_drop); + fn_free_alias(fa, f); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo, NLM_F_REPLACE); + rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, + tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); return 0; } @@ -506,12 +550,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) f = new_f; } - new_fa = &f->fn_embedded_alias; - if (new_fa->fa_info != NULL) { - new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) - goto out; - } + new_fa = fib_fast_alloc(f); + if (new_fa == NULL) + goto out; + new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; @@ -522,13 +564,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) * Insert new entry to the list. */ - write_lock_bh(&fib_hash_lock); if (new_f) fib_insert_node(fz, new_f); - list_add_tail(&new_fa->fa_list, + list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : &f->fn_alias)); fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); if (new_f) fz->fz_nent++; @@ -603,14 +643,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) tb->tb_id, &cfg->fc_nlinfo, 0); kill_fn = 0; - write_lock_bh(&fib_hash_lock); - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(&f->fn_alias)) { - hlist_del(&f->fn_hash); + hlist_del_rcu(&f->fn_hash); kill_fn = 1; } fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); @@ -627,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) static int fn_flush_list(struct fn_zone *fz, int idx) { - struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; struct hlist_node *node, *n; struct fib_node *f; int found = 0; @@ -641,14 +679,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx) struct fib_info *fi = fa->fa_info; if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - write_lock_bh(&fib_hash_lock); - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(&f->fn_alias)) { - hlist_del(&f->fn_hash); + hlist_del_rcu(&f->fn_hash); kill_f = 1; } fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); fn_free_alias(fa, f); found++; @@ -662,13 +698,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx) return found; } +/* caller must hold RTNL. */ int fib_table_flush(struct fib_table *tb) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; int found = 0; - for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + for (fz = rtnl_dereference(table->fn_zone_list); + fz != NULL; + fz = rtnl_dereference(fz->fz_next)) { int i; for (i = fz->fz_divisor - 1; i >= 0; i--) @@ -677,6 +716,24 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz, *next; + + next = table->fn_zone_list; + while (next != NULL) { + fz = next; + next = fz->fz_next; + + if (fz->fz_hash != fz->fz_embedded_hash) + fz_hash_free(fz->fz_hash, fz->fz_divisor); + + kfree(fz); + } + + kfree(tb); +} static inline int fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, @@ -690,10 +747,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, s_i = cb->args[4]; i = 0; - hlist_for_each_entry(f, node, head, fn_hash) { + hlist_for_each_entry_rcu(f, node, head, fn_hash) { struct fib_alias *fa; - list_for_each_entry(fa, &f->fn_alias, fa_list) { + list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { if (i < s_i) goto next; @@ -711,7 +768,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, cb->args[4] = i; return -1; } - next: +next: i++; } } @@ -725,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, struct fn_zone *fz) { int h, s_h; + struct hlist_head *head = rcu_dereference(fz->fz_hash); - if (fz->fz_hash == NULL) + if (head == NULL) return skb->len; s_h = cb->args[3]; for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(&fz->fz_hash[h])) + if (hlist_empty(head + h)) continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) { + if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { cb->args[3] = h; return -1; } @@ -746,23 +804,26 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - int m, s_m; + int m = 0, s_m; struct fn_zone *fz; struct fn_hash *table = (struct fn_hash *)tb->tb_data; s_m = cb->args[2]; - read_lock(&fib_hash_lock); - for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { - if (m < s_m) continue; + rcu_read_lock(); + for (fz = rcu_dereference(table->fn_zone_list); + fz != NULL; + fz = rcu_dereference(fz->fz_next), m++) { + if (m < s_m) + continue; if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[2] = m; - read_unlock(&fib_hash_lock); + rcu_read_unlock(); return -1; } memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); } - read_unlock(&fib_hash_lock); + rcu_read_unlock(); cb->args[2] = m; return skb->len; } @@ -825,14 +886,15 @@ static struct fib_alias *fib_get_first(struct seq_file *seq) iter->genid = fib_hash_genid; iter->valid = 1; - for (iter->zone = table->fn_zone_list; iter->zone; - iter->zone = iter->zone->fz_next) { + for (iter->zone = rcu_dereference(table->fn_zone_list); + iter->zone != NULL; + iter->zone = rcu_dereference(iter->zone->fz_next)) { int maxslot; if (!iter->zone->fz_nent) continue; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); maxslot = iter->zone->fz_divisor; for (iter->bucket = 0; iter->bucket < maxslot; @@ -911,13 +973,13 @@ static struct fib_alias *fib_get_next(struct seq_file *seq) } } - iter->zone = iter->zone->fz_next; + iter->zone = rcu_dereference(iter->zone->fz_next); if (!iter->zone) goto out; iter->bucket = 0; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { list_for_each_entry(fa, &fn->fn_alias, fa_list) { @@ -950,11 +1012,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) } static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(fib_hash_lock) + __acquires(RCU) { void *v = NULL; - read_lock(&fib_hash_lock); + rcu_read_lock(); if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; return v; @@ -967,15 +1029,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void fib_seq_stop(struct seq_file *seq, void *v) - __releases(fib_hash_lock) + __releases(RCU) { - read_unlock(&fib_hash_lock); + rcu_read_unlock(); } static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { static const unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, [8] = RTF_REJECT, + [7] = RTF_REJECT, + [8] = RTF_REJECT, }; unsigned flags = type2flags[type]; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 637b133973bd..a29edf2219c8 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -12,17 +12,22 @@ struct fib_alias { u8 fa_type; u8 fa_scope; u8 fa_state; -#ifdef CONFIG_IP_FIB_TRIE struct rcu_head rcu; -#endif }; #define FA_S_ACCESSED 0x01 +/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +static inline void fib_alias_accessed(struct fib_alias *fa) +{ + if (!(fa->fa_state & FA_S_ACCESSED)) + fa->fa_state |= FA_S_ACCESSED; +} + /* Exported by fib_semantics.c */ extern int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen); + struct fib_result *res, int prefixlen, int fib_flags); extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 76daeb5ff564..7981a24f5c7b 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -6,7 +6,7 @@ * IPv4 Forwarding Information Base: policy rules. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * Thomas Graf <tgraf@suug.ch> + * Thomas Graf <tgraf@suug.ch> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,7 +14,7 @@ * 2 of the License, or (at your option) any later version. * * Fixes: - * Rani Assaf : local_rule cannot be deleted + * Rani Assaf : local_rule cannot be deleted * Marc Boucher : routing by fwmark */ @@ -32,8 +32,7 @@ #include <net/ip_fib.h> #include <net/fib_rules.h> -struct fib4_rule -{ +struct fib4_rule { struct fib_rule common; u8 dst_len; u8 src_len; @@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, + .flags = FIB_LOOKUP_NOREF, }; int err; @@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, goto errout; } - if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) + tbl = fib_get_table(rule->fr_net, rule->table); + if (!tbl) goto errout; - err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); + err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); if (err > 0) err = -EAGAIN; errout: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 20f09c5b31e8..3e0da3ef6116 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; static DEFINE_SPINLOCK(fib_multipath_lock); -#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ -for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ -for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) +#define for_nexthops(fi) { \ + int nhsel; const struct fib_nh *nh; \ + for (nhsel = 0, nh = (fi)->fib_nh; \ + nhsel < (fi)->fib_nhs; \ + nh++, nhsel++) + +#define change_nexthops(fi) { \ + int nhsel; struct fib_nh *nexthop_nh; \ + for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ + nhsel < (fi)->fib_nhs; \ + nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ /* Hope, that gcc will optimize it to get rid of dummy loop */ -#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ -for (nhsel=0; nhsel < 1; nhsel++) +#define for_nexthops(fi) { \ + int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ + for (nhsel = 0; nhsel < 1; nhsel++) -#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ -for (nhsel=0; nhsel < 1; nhsel++) +#define change_nexthops(fi) { \ + int nhsel; \ + struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ + for (nhsel = 0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -86,63 +95,70 @@ static const struct int error; u8 scope; } fib_props[RTN_MAX + 1] = { - { + [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, - }, /* RTN_UNSPEC */ - { + }, + [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_UNICAST */ - { + }, + [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST, - }, /* RTN_LOCAL */ - { + }, + [RTN_BROADCAST] = { .error = 0, .scope = RT_SCOPE_LINK, - }, /* RTN_BROADCAST */ - { + }, + [RTN_ANYCAST] = { .error = 0, .scope = RT_SCOPE_LINK, - }, /* RTN_ANYCAST */ - { + }, + [RTN_MULTICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_MULTICAST */ - { + }, + [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_BLACKHOLE */ - { + }, + [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_UNREACHABLE */ - { + }, + [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_PROHIBIT */ - { + }, + [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_THROW */ - { + }, + [RTN_NAT] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, - }, /* RTN_NAT */ - { + }, + [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, - }, /* RTN_XRESOLVE */ + }, }; /* Release a nexthop info record */ +static void free_fib_info_rcu(struct rcu_head *head) +{ + struct fib_info *fi = container_of(head, struct fib_info, rcu); + + kfree(fi); +} + void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { - printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); + pr_warning("Freeing alive fib_info %p\n", fi); return; } change_nexthops(fi) { @@ -152,7 +168,7 @@ void free_fib_info(struct fib_info *fi) } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); - kfree(fi); + call_rcu(&fi->rcu, free_fib_info_rcu); } void fib_release_info(struct fib_info *fi) @@ -173,7 +189,7 @@ void fib_release_info(struct fib_info *fi) spin_unlock_bh(&fib_info_lock); } -static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) { const struct fib_nh *onh = ofi->fib_nh; @@ -187,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * #ifdef CONFIG_NET_CLS_ROUTE nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) return -1; onh++; } endfor_nexthops(fi); @@ -238,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 && - ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -247,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) } /* Check, that the gateway is already configured. - Used only by redirect accept routine. + * Used only by redirect accept routine. */ - int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; @@ -264,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) hlist_for_each_entry(nh, node, head, nh_hash) { if (nh->nh_dev == dev && nh->nh_gw == gw && - !(nh->nh_flags&RTNH_F_DEAD)) { + !(nh->nh_flags & RTNH_F_DEAD)) { spin_unlock(&fib_info_lock); return 0; } @@ -362,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order, } if (state == NUD_REACHABLE) return 0; - if ((state&NUD_VALID) && order != dflt) + if ((state & NUD_VALID) && order != dflt) return 0; - if ((state&NUD_VALID) || - (*last_idx<0 && order > dflt)) { + if ((state & NUD_VALID) || + (*last_idx < 0 && order > dflt)) { *last_resort = fi; *last_idx = order; } @@ -476,75 +491,76 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) /* - Picture - ------- - - Semantics of nexthop is very messy by historical reasons. - We have to take into account, that: - a) gateway can be actually local interface address, - so that gatewayed route is direct. - b) gateway must be on-link address, possibly - described not by an ifaddr, but also by a direct route. - c) If both gateway and interface are specified, they should not - contradict. - d) If we use tunnel routes, gateway could be not on-link. - - Attempt to reconcile all of these (alas, self-contradictory) conditions - results in pretty ugly and hairy code with obscure logic. - - I chose to generalized it instead, so that the size - of code does not increase practically, but it becomes - much more general. - Every prefix is assigned a "scope" value: "host" is local address, - "link" is direct route, - [ ... "site" ... "interior" ... ] - and "universe" is true gateway route with global meaning. - - Every prefix refers to a set of "nexthop"s (gw, oif), - where gw must have narrower scope. This recursion stops - when gw has LOCAL scope or if "nexthop" is declared ONLINK, - which means that gw is forced to be on link. - - Code is still hairy, but now it is apparently logically - consistent and very flexible. F.e. as by-product it allows - to co-exists in peace independent exterior and interior - routing processes. - - Normally it looks as following. - - {universe prefix} -> (gw, oif) [scope link] - | - |-> {link prefix} -> (gw, oif) [scope local] - | - |-> {local prefix} (terminal node) + * Picture + * ------- + * + * Semantics of nexthop is very messy by historical reasons. + * We have to take into account, that: + * a) gateway can be actually local interface address, + * so that gatewayed route is direct. + * b) gateway must be on-link address, possibly + * described not by an ifaddr, but also by a direct route. + * c) If both gateway and interface are specified, they should not + * contradict. + * d) If we use tunnel routes, gateway could be not on-link. + * + * Attempt to reconcile all of these (alas, self-contradictory) conditions + * results in pretty ugly and hairy code with obscure logic. + * + * I chose to generalized it instead, so that the size + * of code does not increase practically, but it becomes + * much more general. + * Every prefix is assigned a "scope" value: "host" is local address, + * "link" is direct route, + * [ ... "site" ... "interior" ... ] + * and "universe" is true gateway route with global meaning. + * + * Every prefix refers to a set of "nexthop"s (gw, oif), + * where gw must have narrower scope. This recursion stops + * when gw has LOCAL scope or if "nexthop" is declared ONLINK, + * which means that gw is forced to be on link. + * + * Code is still hairy, but now it is apparently logically + * consistent and very flexible. F.e. as by-product it allows + * to co-exists in peace independent exterior and interior + * routing processes. + * + * Normally it looks as following. + * + * {universe prefix} -> (gw, oif) [scope link] + * | + * |-> {link prefix} -> (gw, oif) [scope local] + * | + * |-> {local prefix} (terminal node) */ - static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { int err; struct net *net; + struct net_device *dev; net = cfg->fc_nlinfo.nl_net; if (nh->nh_gw) { struct fib_result res; - if (nh->nh_flags&RTNH_F_ONLINK) { - struct net_device *dev; + if (nh->nh_flags & RTNH_F_ONLINK) { if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) return -EINVAL; - if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) + dev = __dev_get_by_index(net, nh->nh_oif); + if (!dev) return -ENODEV; - if (!(dev->flags&IFF_UP)) + if (!(dev->flags & IFF_UP)) return -ENETDOWN; nh->nh_dev = dev; dev_hold(dev); nh->nh_scope = RT_SCOPE_LINK; return 0; } + rcu_read_lock(); { struct flowi fl = { .nl_u = { @@ -559,50 +575,53 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl.fl4_scope < RT_SCOPE_LINK) fl.fl4_scope = RT_SCOPE_LINK; - if ((err = fib_lookup(net, &fl, &res)) != 0) + err = fib_lookup(net, &fl, &res); + if (err) { + rcu_read_unlock(); return err; + } } err = -EINVAL; if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) goto out; nh->nh_scope = res.scope; nh->nh_oif = FIB_RES_OIF(res); - if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) + nh->nh_dev = dev = FIB_RES_DEV(res); + if (!dev) goto out; - dev_hold(nh->nh_dev); - err = -ENETDOWN; - if (!(nh->nh_dev->flags & IFF_UP)) - goto out; - err = 0; -out: - fib_res_put(&res); - return err; + dev_hold(dev); + err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; - if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) return -EINVAL; + rcu_read_lock(); + err = -ENODEV; in_dev = inetdev_by_index(net, nh->nh_oif); if (in_dev == NULL) - return -ENODEV; - if (!(in_dev->dev->flags&IFF_UP)) { - in_dev_put(in_dev); - return -ENETDOWN; - } + goto out; + err = -ENETDOWN; + if (!(in_dev->dev->flags & IFF_UP)) + goto out; nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; - in_dev_put(in_dev); + err = 0; } - return 0; +out: + rcu_read_unlock(); + return err; } static inline unsigned int fib_laddr_hashfn(__be32 val) { unsigned int mask = (fib_hash_size - 1); - return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; + return ((__force u32)val ^ + ((__force u32)val >> 7) ^ + ((__force u32)val >> 14)) & mask; } static struct hlist_head *fib_hash_alloc(int bytes) @@ -611,7 +630,8 @@ static struct hlist_head *fib_hash_alloc(int bytes) return kzalloc(bytes, GFP_KERNEL); else return (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); + __get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(bytes)); } static void fib_hash_free(struct hlist_head *hash, int bytes) @@ -806,7 +826,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } else { change_nexthops(fi) { - if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) + err = fib_check_nh(cfg, fi, nexthop_nh); + if (err != 0) goto failure; } endfor_nexthops(fi) } @@ -819,7 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } link_it: - if ((ofi = fib_find_info(fi)) != NULL) { + ofi = fib_find_info(fi); + if (ofi) { fi->fib_dead = 1; free_fib_info(fi); ofi->fib_treeref++; @@ -864,7 +886,7 @@ failure: /* Note! fib_semantic_match intentionally uses RCU list functions. */ int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen) + struct fib_result *res, int prefixlen, int fib_flags) { struct fib_alias *fa; int nh_sel = 0; @@ -879,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, if (fa->fa_scope < flp->fl4_scope) continue; - fa->fa_state |= FA_S_ACCESSED; + fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (err == 0) { @@ -895,7 +917,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, case RTN_ANYCAST: case RTN_MULTICAST: for_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) + if (nh->nh_flags & RTNH_F_DEAD) continue; if (!flp->oif || flp->oif == nh->nh_oif) break; @@ -906,16 +928,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, goto out_fill_res; } #else - if (nhsel < 1) { + if (nhsel < 1) goto out_fill_res; - } #endif endfor_nexthops(fi); continue; default: - printk(KERN_WARNING "fib_semantic_match bad type %#x\n", - fa->fa_type); + pr_warning("fib_semantic_match bad type %#x\n", + fa->fa_type); return -EINVAL; } } @@ -929,7 +950,8 @@ out_fill_res: res->type = fa->fa_type; res->scope = fa->fa_scope; res->fi = fa->fa_info; - atomic_inc(&res->fi->fib_clntref); + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&res->fi->fib_clntref); return 0; } @@ -1028,10 +1050,10 @@ nla_put_failure: } /* - Update FIB if: - - local address disappeared -> we must delete all the entries - referring to it. - - device went down -> we must shutdown all nexthops going via it. + * Update FIB if: + * - local address disappeared -> we must delete all the entries + * referring to it. + * - device went down -> we must shutdown all nexthops going via it. */ int fib_sync_down_addr(struct net *net, __be32 local) { @@ -1078,7 +1100,7 @@ int fib_sync_down_dev(struct net_device *dev, int force) prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nexthop_nh->nh_flags&RTNH_F_DEAD) + if (nexthop_nh->nh_flags & RTNH_F_DEAD) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { @@ -1110,10 +1132,9 @@ int fib_sync_down_dev(struct net_device *dev, int force) #ifdef CONFIG_IP_ROUTE_MULTIPATH /* - Dead device goes up. We wake up dead nexthops. - It takes sense only on multipath routes. + * Dead device goes up. We wake up dead nexthops. + * It takes sense only on multipath routes. */ - int fib_sync_up(struct net_device *dev) { struct fib_info *prev_fi; @@ -1123,7 +1144,7 @@ int fib_sync_up(struct net_device *dev) struct fib_nh *nh; int ret; - if (!(dev->flags&IFF_UP)) + if (!(dev->flags & IFF_UP)) return 0; prev_fi = NULL; @@ -1142,12 +1163,12 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { alive++; continue; } if (nexthop_nh->nh_dev == NULL || - !(nexthop_nh->nh_dev->flags&IFF_UP)) + !(nexthop_nh->nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) @@ -1169,10 +1190,9 @@ int fib_sync_up(struct net_device *dev) } /* - The algorithm is suboptimal, but it provides really - fair weighted route distribution. + * The algorithm is suboptimal, but it provides really + * fair weighted route distribution. */ - void fib_select_multipath(const struct flowi *flp, struct fib_result *res) { struct fib_info *fi = res->fi; @@ -1182,7 +1202,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { power += nexthop_nh->nh_weight; nexthop_nh->nh_power = nexthop_nh->nh_weight; } @@ -1198,15 +1218,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) /* w should be random number [0..fi->fib_power-1], - it is pretty bad approximation. + * it is pretty bad approximation. */ w = jiffies % fi->fib_power; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && nexthop_nh->nh_power) { - if ((w -= nexthop_nh->nh_power) <= 0) { + w -= nexthop_nh->nh_power; + if (w <= 0) { nexthop_nh->nh_power--; fi->fib_power--; res->nh_sel = nhsel; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 79d057a939ba..200eb538fbb3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -16,7 +16,7 @@ * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + * http://www.csc.kth.se/~snilsson/software/dyntrie2/ * * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson @@ -186,7 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node) { struct tnode *ret = node_parent(node); - return rcu_dereference(ret); + return rcu_dereference_rtnl(ret); } /* Same as rcu_assign_pointer @@ -209,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { struct node *ret = tnode_get_child(tn, i); - return rcu_dereference_check(ret, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + return rcu_dereference_rtnl(ret); } static inline int tnode_child_length(const struct tnode *tn) @@ -457,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<<bits; } - pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), - (unsigned long) (sizeof(struct node) << bits)); + pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), + sizeof(struct node) << bits); return tn; } @@ -607,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!node_parent((struct node*) tn)) { + if (!node_parent((struct node *)tn)) { inflate_threshold_use = inflate_threshold_root; halve_threshold_use = halve_threshold_root; - } - else { + } else { inflate_threshold_use = inflate_threshold; halve_threshold_use = halve_threshold; } @@ -637,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) check_tnode(tn); /* Return if at least one inflate is run */ - if( max_work != MAX_WORK) + if (max_work != MAX_WORK) return (struct node *) tn; /* @@ -964,9 +961,7 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = rcu_dereference_check(t->trie, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + n = rcu_dereference_rtnl(t->trie); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; @@ -1347,7 +1342,7 @@ err: /* should be called with rcu_read_lock */ static int check_leaf(struct trie *t, struct leaf *l, t_key key, const struct flowi *flp, - struct fib_result *res) + struct fib_result *res, int fib_flags) { struct leaf_info *li; struct hlist_head *hhead = &l->list; @@ -1361,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l, if (l->key != (key & ntohl(mask))) continue; - err = fib_semantic_match(&li->falh, flp, res, plen); + err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); #ifdef CONFIG_IP_FIB_TRIE_STATS if (err <= 0) @@ -1377,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l, } int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, - struct fib_result *res) + struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; int ret; @@ -1389,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, t_key cindex = 0; int current_prefix_length = KEYLENGTH; struct tnode *cn; - t_key node_prefix, key_prefix, pref_mismatch; - int mp; + t_key pref_mismatch; rcu_read_lock(); @@ -1404,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, /* Just a leaf? */ if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res); + ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); goto found; } @@ -1429,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, } if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res); + ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); if (ret > 0) goto backtrace; goto found; @@ -1505,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, * matching prefix. */ - node_prefix = mask_pfx(cn->key, cn->pos); - key_prefix = mask_pfx(key, cn->pos); - pref_mismatch = key_prefix^node_prefix; - mp = 0; + pref_mismatch = mask_pfx(cn->key ^ key, cn->pos); /* * In short: If skipped bits in this node do not match @@ -1516,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, * state.directly. */ if (pref_mismatch) { - while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { - mp++; - pref_mismatch = pref_mismatch << 1; - } - key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); + int mp = KEYLENGTH - fls(pref_mismatch); - if (key_prefix != 0) + if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) goto backtrace; if (current_prefix_length >= cn->pos) @@ -1746,14 +1733,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) /* Node empty, walk back up to parent */ c = (struct node *) p; - } while ( (p = node_parent_rcu(c)) != NULL); + } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ } static struct leaf *trie_firstleaf(struct trie *t) { - struct tnode *n = (struct tnode *) rcu_dereference(t->trie); + struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); if (!n) return NULL; @@ -1810,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + kfree(tb); +} + void fib_table_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) @@ -1851,7 +1843,8 @@ void fib_table_select_default(struct fib_table *tb, if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; - fa->fa_state |= FA_S_ACCESSED; + + fib_alias_accessed(fa); if (fi == NULL) { if (next_fi != res->fi) @@ -2039,14 +2032,14 @@ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct tnode *tnode; - unsigned index; - unsigned depth; + unsigned int index; + unsigned int depth; }; static struct node *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; - unsigned cindex = iter->index; + unsigned int cindex = iter->index; struct tnode *p; /* A single entry routing table */ @@ -2155,7 +2148,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { - unsigned i, max, pointers, bytes, avdepth; + unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; @@ -2352,7 +2345,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v) static void seq_indent(struct seq_file *seq, int n) { - while (n-- > 0) seq_puts(seq, " "); + while (n-- > 0) + seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) @@ -2384,7 +2378,7 @@ static const char *const rtn_type_names[__RTN_MAX] = { [RTN_XRESOLVE] = "XRESOLVE", }; -static inline const char *rtn_type(char *buf, size_t len, unsigned t) +static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; @@ -2540,13 +2534,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) { - static unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, [8] = RTF_REJECT, - }; - unsigned flags = type2flags[type]; + unsigned int flags = 0; + if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) + flags = RTF_REJECT; if (fi && fi->fib_nh->nh_gw) flags |= RTF_GATEWAY; if (mask == htonl(0xFFFFFFFF)) @@ -2558,7 +2551,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed - * and needs to be same as fib_hash output to avoid breaking + * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) @@ -2583,7 +2576,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; - unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); + unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); int len; if (fa->fa_type == RTN_BROADCAST diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c new file mode 100644 index 000000000000..c6933f2ea310 --- /dev/null +++ b/net/ipv4/gre.c @@ -0,0 +1,152 @@ +/* + * GRE over IPv4 demultiplexer driver + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/version.h> +#include <linux/spinlock.h> +#include <net/protocol.h> +#include <net/gre.h> + + +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; +static DEFINE_SPINLOCK(gre_proto_lock); + +int gre_add_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version]) + goto err_out_unlock; + + rcu_assign_pointer(gre_proto[version], proto); + spin_unlock(&gre_proto_lock); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_add_protocol); + +int gre_del_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (rcu_dereference_protected(gre_proto[version], + lockdep_is_held(&gre_proto_lock)) != proto) + goto err_out_unlock; + rcu_assign_pointer(gre_proto[version], NULL); + spin_unlock(&gre_proto_lock); + synchronize_rcu(); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_del_protocol); + +static int gre_rcv(struct sk_buff *skb) +{ + const struct gre_protocol *proto; + u8 ver; + int ret; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->handler) + goto drop_unlock; + ret = proto->handler(skb); + rcu_read_unlock(); + return ret; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + const struct gre_protocol *proto; + u8 ver; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->err_handler) + goto drop_unlock; + proto->err_handler(skb, info); + rcu_read_unlock(); + return; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); +} + +static const struct net_protocol net_gre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .netns_ok = 1, +}; + +static int __init gre_init(void) +{ + pr_info("GRE over IPv4 demultiplexor driver"); + + if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { + pr_err("gre: can't add protocol\n"); + return -EAGAIN; + } + + return 0; +} + +static void __exit gre_exit(void) +{ + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +} + +module_init(gre_init); +module_exit(gre_exit); + +MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); +MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); + diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d65e9215bcd7..96bc7f9475a3 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -181,6 +181,7 @@ const struct icmp_err icmp_err_convert[] = { .fatal = 1, }, }; +EXPORT_SYMBOL(icmp_err_convert); /* * ICMP control array. This specifies what to do with each ICMP. @@ -267,11 +268,12 @@ int xrlim_allow(struct dst_entry *dst, int timeout) dst->rate_tokens = token; return rc; } +EXPORT_SYMBOL(xrlim_allow); static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, int type, int code) { - struct dst_entry *dst = &rt->u.dst; + struct dst_entry *dst = &rt->dst; int rc = 1; if (type > NR_ICMP_TYPES) @@ -327,7 +329,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, struct sock *sk; struct sk_buff *skb; - sk = icmp_sk(dev_net((*rt)->u.dst.dev)); + sk = icmp_sk(dev_net((*rt)->dst.dev)); if (ip_append_data(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, @@ -359,7 +361,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); struct sock *sk; struct inet_sock *inet; __be32 daddr; @@ -377,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) @@ -427,7 +429,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!rt) goto out; - net = dev_net(rt->u.dst.dev); + net = dev_net(rt->dst.dev); /* * Find the original header. It is expected to be valid, of course. @@ -536,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; { struct flowi fl = { @@ -596,9 +598,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->u.dst.dev); + RT_TOS(tos), rt2->dst.dev); - dst_release(&rt2->u.dst); + dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); skb_in->_skb_refdst = orefdst; /* restore old refdst */ } @@ -610,7 +612,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) XFRM_LOOKUP_ICMP); switch (err) { case 0: - dst_release(&rt->u.dst); + dst_release(&rt->dst); rt = rt2; break; case -EPERM: @@ -629,7 +631,7 @@ route_done: /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_mtu(&rt->u.dst); + room = dst_mtu(&rt->dst); if (room > 576) room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; @@ -647,6 +649,7 @@ out_unlock: icmp_xmit_unlock(sk); out:; } +EXPORT_SYMBOL(icmp_send); /* @@ -925,6 +928,7 @@ static void icmp_address(struct sk_buff *skb) /* * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain * loudly if an inconsistency is found. + * called with rcu_read_lock() */ static void icmp_address_reply(struct sk_buff *skb) @@ -935,12 +939,12 @@ static void icmp_address_reply(struct sk_buff *skb) struct in_ifaddr *ifa; if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) - goto out; + return; - in_dev = in_dev_get(dev); + in_dev = __in_dev_get_rcu(dev); if (!in_dev) - goto out; - rcu_read_lock(); + return; + if (in_dev->ifa_list && IN_DEV_LOG_MARTIANS(in_dev) && IN_DEV_FORWARD(in_dev)) { @@ -958,9 +962,6 @@ static void icmp_address_reply(struct sk_buff *skb) mp, dev->name, &rt->rt_src); } } - rcu_read_unlock(); - in_dev_put(in_dev); -out:; } static void icmp_discard(struct sk_buff *skb) @@ -974,7 +975,7 @@ int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); @@ -1216,7 +1217,3 @@ int __init icmp_init(void) { return register_pernet_subsys(&icmp_sk_ops); } - -EXPORT_SYMBOL(icmp_err_convert); -EXPORT_SYMBOL(icmp_send); -EXPORT_SYMBOL(xrlim_allow); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5fff865a4fa7..c8877c6c7216 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -312,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -330,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) pip->saddr = rt->rt_src; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(pip, &rt->u.dst, NULL); + ip_select_ident(pip, &rt->dst, NULL); ((u8*)&pip[1])[0] = IPOPT_RA; ((u8*)&pip[1])[1] = 4; ((u8*)&pip[1])[2] = 0; @@ -660,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, return -1; } - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); skb_reserve(skb, LL_RESERVED_SPACE(dev)); @@ -676,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = rt->rt_src; iph->protocol = IPPROTO_IGMP; - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; ((u8*)&iph[1])[1] = 4; ((u8*)&iph[1])[2] = 0; @@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, igmpv3_clear_delrec(in_dev); } else if (len < 12) { return; /* ignore bogus packet; freed by caller */ + } else if (IGMP_V1_SEEN(in_dev)) { + /* This is a v3 query with v1 queriers present */ + max_delay = IGMP_Query_Response_Interval; + group = 0; + } else if (IGMP_V2_SEEN(in_dev)) { + /* this is a v3 query with v2 queriers present; + * Interpretation of the max_delay code is problematic here. + * A real v2 host would use ih_code directly, while v3 has a + * different encoding. We use the v3 encoding as more likely + * to be intended in a v3 query. + */ + max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return; @@ -916,18 +928,19 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, read_unlock(&in_dev->mc_list_lock); } +/* called in rcu_read_lock() section */ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; - struct in_device *in_dev = in_dev_get(skb->dev); + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); int len = skb->len; if (in_dev == NULL) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) - goto drop_ref; + goto drop; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -937,7 +950,7 @@ int igmp_rcv(struct sk_buff *skb) case CHECKSUM_NONE: skb->csum = 0; if (__skb_checksum_complete(skb)) - goto drop_ref; + goto drop; } ih = igmp_hdr(skb); @@ -957,7 +970,6 @@ int igmp_rcv(struct sk_buff *skb) break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 - in_dev_put(in_dev); return pim_rcv_v1(skb); #endif case IGMPV3_HOST_MEMBERSHIP_REPORT: @@ -971,8 +983,6 @@ int igmp_rcv(struct sk_buff *skb) break; } -drop_ref: - in_dev_put(in_dev); drop: kfree_skb(skb); return 0; @@ -1246,6 +1256,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) out: return; } +EXPORT_SYMBOL(ip_mc_inc_group); /* * Resend IGMP JOIN report; used for bonding. @@ -1258,16 +1269,17 @@ void ip_mc_rejoin_group(struct ip_mc_list *im) if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { - igmp_mod_timer(im, IGMP_Initial_Report_Delay); - return; - } - /* else, v3 */ - im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; - igmp_ifc_event(in_dev); + /* a failover is happening and switches + * must be notified immediately */ + if (IGMP_V1_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); + else if (IGMP_V2_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); + else + igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); #endif } +EXPORT_SYMBOL(ip_mc_rejoin_group); /* * A socket has left a multicast group on device dev @@ -1298,6 +1310,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) } } } +EXPORT_SYMBOL(ip_mc_dec_group); /* Device changing type */ @@ -1405,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) write_unlock_bh(&in_dev->mc_list_lock); } +/* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct flowi fl = { .nl_u = { .ip4_u = @@ -1415,19 +1429,16 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) if (imr->imr_ifindex) { idev = inetdev_by_index(net, imr->imr_ifindex); - if (idev) - __in_dev_put(idev); return idev; } if (imr->imr_address.s_addr) { - dev = ip_dev_find(net, imr->imr_address.s_addr); + dev = __ip_dev_find(net, imr->imr_address.s_addr, false); if (!dev) return NULL; - dev_put(dev); } if (!dev && !ip_route_output_key(net, &rt, &fl)) { - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); } if (dev) { @@ -1646,8 +1657,7 @@ static int sf_setstate(struct ip_mc_list *pmc) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { - dpsf = (struct ip_sf_list *) - kmalloc(sizeof(*dpsf), GFP_ATOMIC); + dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; @@ -1807,6 +1817,7 @@ done: rtnl_unlock(); return err; } +EXPORT_SYMBOL(ip_mc_join_group); static void ip_sf_socklist_reclaim(struct rcu_head *rp) { @@ -2679,8 +2690,3 @@ int __init igmp_mc_proc_init(void) return register_pernet_subsys(&igmp_net_ops); } #endif - -EXPORT_SYMBOL(ip_mc_dec_group); -EXPORT_SYMBOL(ip_mc_inc_group); -EXPORT_SYMBOL(ip_mc_join_group); -EXPORT_SYMBOL(ip_mc_rejoin_group); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 70eb3507c406..7174370b1195 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -84,7 +84,6 @@ int inet_csk_bind_conflict(const struct sock *sk, } return node != NULL; } - EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, @@ -212,7 +211,6 @@ fail: local_bh_enable(); return ret; } - EXPORT_SYMBOL_GPL(inet_csk_get_port); /* @@ -305,7 +303,6 @@ out_err: *err = error; goto out; } - EXPORT_SYMBOL(inet_csk_accept); /* @@ -327,7 +324,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } - EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) @@ -340,21 +336,18 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } - EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } - EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } - EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(struct sock *sk, @@ -383,7 +376,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, goto no_route; if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto route_err; - return &rt->u.dst; + return &rt->dst; route_err: ip_rt_put(rt); @@ -391,7 +384,6 @@ no_route: IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } - EXPORT_SYMBOL_GPL(inet_csk_route_req); static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, @@ -433,7 +425,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, return req; } - EXPORT_SYMBOL_GPL(inet_csk_search_req); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, @@ -447,11 +438,11 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); inet_csk_reqsk_queue_added(sk, timeout); } +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /* Only thing we need from tcp.h */ extern int sysctl_tcp_synack_retries; -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, @@ -569,7 +560,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, if (lopt->qlen) inet_csk_reset_keepalive_timer(parent, interval); } - EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, @@ -599,7 +589,6 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, } return newsk; } - EXPORT_SYMBOL_GPL(inet_csk_clone); /* @@ -630,7 +619,6 @@ void inet_csk_destroy_sock(struct sock *sk) percpu_counter_dec(sk->sk_prot->orphan_count); sock_put(sk); } - EXPORT_SYMBOL(inet_csk_destroy_sock); int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) @@ -665,7 +653,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } - EXPORT_SYMBOL_GPL(inet_csk_listen_start); /* @@ -720,7 +707,6 @@ void inet_csk_listen_stop(struct sock *sk) } WARN_ON(sk->sk_ack_backlog); } - EXPORT_SYMBOL_GPL(inet_csk_listen_stop); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) @@ -732,7 +718,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) sin->sin_addr.s_addr = inet->inet_daddr; sin->sin_port = inet->inet_dport; } - EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); #ifdef CONFIG_COMPAT @@ -747,7 +732,6 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, return icsk->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, @@ -761,6 +745,5 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, return icsk->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); #endif diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e5fa2ddce320..ba8042665849 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len, bc += op->no; } } - return (len == 0); + return len == 0; } static int valid_cc(const void *bc, int len, int cc) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index a2ca6aed763b..5ff2a51b6d0c 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -114,7 +114,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) fq->last_in |= INET_FRAG_COMPLETE; } } - EXPORT_SYMBOL(inet_frag_kill); static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d3e160a88219..1b344f30b463 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -99,24 +99,46 @@ void inet_put_port(struct sock *sk) __inet_put_port(sk); local_bh_enable(); } - EXPORT_SYMBOL(inet_put_port); -void __inet_inherit_port(struct sock *sk, struct sock *child) +int __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, + unsigned short port = inet_sk(child)->inet_num; + const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (tb->port != port) { + /* NOTE: using tproxy and redirecting skbs to a proxy + * on a different listener port breaks the assumption + * that the listener socket's icsk_bind_hash is the same + * as that of the child socket. We have to look up or + * create a new bind bucket for the child here. */ + struct hlist_node *node; + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (net_eq(ib_net(tb), sock_net(sk)) && + tb->port == port) + break; + } + if (!node) { + tb = inet_bind_bucket_create(table->bind_bucket_cachep, + sock_net(sk), head, port); + if (!tb) { + spin_unlock(&head->lock); + return -ENOMEM; + } + } + } sk_add_bind_node(child, &tb->owners); inet_csk(child)->icsk_bind_hash = tb; spin_unlock(&head->lock); -} + return 0; +} EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, @@ -546,7 +568,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), __inet_check_established, __inet_hash_nolisten); } - EXPORT_SYMBOL_GPL(inet_hash_connect); void inet_hashinfo_init(struct inet_hashinfo *h) @@ -560,5 +581,4 @@ void inet_hashinfo_init(struct inet_hashinfo *h) i + LISTENING_NULLS_BASE); } } - EXPORT_SYMBOL_GPL(inet_hashinfo_init); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 6bcfe52a9c87..9e94d7cf4f8a 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -51,8 +51,8 @@ * lookups performed with disabled BHs. * * Serialisation issues. - * 1. Nodes may appear in the tree only with the pool write lock held. - * 2. Nodes may disappear from the tree only with the pool write lock held + * 1. Nodes may appear in the tree only with the pool lock held. + * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. * 3. Nodes appears and disappears from unused node list only under * "inet_peer_unused_lock". @@ -64,23 +64,32 @@ * usually under some other lock to prevent node disappearing * dtime: unused node list lock * v4daddr: unchangeable - * ip_id_count: idlock + * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height -static struct inet_peer peer_fake_node = { - .avl_left = &peer_fake_node, - .avl_right = &peer_fake_node, + +#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) +#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) +static const struct inet_peer peer_fake_node = { + .avl_left = peer_avl_empty_rcu, + .avl_right = peer_avl_empty_rcu, .avl_height = 0 }; -#define peer_avl_empty (&peer_fake_node) -static struct inet_peer *peer_root = peer_avl_empty; -static DEFINE_RWLOCK(peer_pool_lock); + +static struct { + struct inet_peer __rcu *root; + spinlock_t lock; + int total; +} peers = { + .root = peer_avl_empty_rcu, + .lock = __SPIN_LOCK_UNLOCKED(peers.lock), + .total = 0, +}; #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ -static int peer_total; /* Exported for sysctl_net_ipv4. */ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more * aggressively at this stage */ @@ -89,8 +98,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min int inet_peer_gc_mintime __read_mostly = 10 * HZ; int inet_peer_gc_maxtime __read_mostly = 120 * HZ; -static LIST_HEAD(unused_peers); -static DEFINE_SPINLOCK(inet_peer_unused_lock); +static struct { + struct list_head list; + spinlock_t lock; +} unused_peers = { + .list = LIST_HEAD_INIT(unused_peers.list), + .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock), +}; static void peer_check_expire(unsigned long dummy); static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); @@ -116,7 +130,7 @@ void __init inet_initpeers(void) peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); /* All the timers, started at system startup tend @@ -131,122 +145,174 @@ void __init inet_initpeers(void) /* Called with or without local BH being disabled. */ static void unlink_from_unused(struct inet_peer *p) { - spin_lock_bh(&inet_peer_unused_lock); - list_del_init(&p->unused); - spin_unlock_bh(&inet_peer_unused_lock); + if (!list_empty(&p->unused)) { + spin_lock_bh(&unused_peers.lock); + list_del_init(&p->unused); + spin_unlock_bh(&unused_peers.lock); + } } /* * Called with local BH disabled and the pool lock held. - * _stack is known to be NULL or not at compile time, - * so compiler will optimize the if (_stack) tests. */ #define lookup(_daddr, _stack) \ ({ \ - struct inet_peer *u, **v; \ - if (_stack != NULL) { \ - stackptr = _stack; \ - *stackptr++ = &peer_root; \ - } \ - for (u = peer_root; u != peer_avl_empty; ) { \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ + \ + stackptr = _stack; \ + *stackptr++ = &peers.root; \ + for (u = rcu_dereference_protected(peers.root, \ + lockdep_is_held(&peers.lock)); \ + u != peer_avl_empty; ) { \ if (_daddr == u->v4daddr) \ break; \ if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ v = &u->avl_left; \ else \ v = &u->avl_right; \ - if (_stack != NULL) \ - *stackptr++ = v; \ - u = *v; \ + *stackptr++ = v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) -/* Called with local BH disabled and the pool write lock held. */ +/* + * Called with rcu_read_lock_bh() + * Because we hold no lock against a writer, its quite possible we fall + * in an endless loop. + * But every pointer we follow is guaranteed to be valid thanks to RCU. + * We exit from this function if number of links exceeds PEER_MAXDEPTH + */ +static struct inet_peer *lookup_rcu_bh(__be32 daddr) +{ + struct inet_peer *u = rcu_dereference_bh(peers.root); + int count = 0; + + while (u != peer_avl_empty) { + if (daddr == u->v4daddr) { + /* Before taking a reference, check if this entry was + * deleted, unlink_from_pool() sets refcnt=-1 to make + * distinction between an unused entry (refcnt=0) and + * a freed one. + */ + if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) + u = NULL; + return u; + } + if ((__force __u32)daddr < (__force __u32)u->v4daddr) + u = rcu_dereference_bh(u->avl_left); + else + u = rcu_dereference_bh(u->avl_right); + if (unlikely(++count == PEER_MAXDEPTH)) + break; + } + return NULL; +} + +/* Called with local BH disabled and the pool lock held. */ #define lookup_rightempty(start) \ ({ \ - struct inet_peer *u, **v; \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ - for (u = *v; u->avl_right != peer_avl_empty; ) { \ + for (u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ + u->avl_right != peer_avl_empty_rcu; ) { \ v = &u->avl_right; \ *stackptr++ = v; \ - u = *v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) -/* Called with local BH disabled and the pool write lock held. +/* Called with local BH disabled and the pool lock held. * Variable names are the proof of operation correctness. - * Look into mm/map_avl.c for more detail description of the ideas. */ -static void peer_avl_rebalance(struct inet_peer **stack[], - struct inet_peer ***stackend) + * Look into mm/map_avl.c for more detail description of the ideas. + */ +static void peer_avl_rebalance(struct inet_peer __rcu **stack[], + struct inet_peer __rcu ***stackend) { - struct inet_peer **nodep, *node, *l, *r; + struct inet_peer __rcu **nodep; + struct inet_peer *node, *l, *r; int lh, rh; while (stackend > stack) { nodep = *--stackend; - node = *nodep; - l = node->avl_left; - r = node->avl_right; + node = rcu_dereference_protected(*nodep, + lockdep_is_held(&peers.lock)); + l = rcu_dereference_protected(node->avl_left, + lockdep_is_held(&peers.lock)); + r = rcu_dereference_protected(node->avl_right, + lockdep_is_held(&peers.lock)); lh = node_height(l); rh = node_height(r); if (lh > rh + 1) { /* l: RH+2 */ struct inet_peer *ll, *lr, *lrl, *lrr; int lrh; - ll = l->avl_left; - lr = l->avl_right; + ll = rcu_dereference_protected(l->avl_left, + lockdep_is_held(&peers.lock)); + lr = rcu_dereference_protected(l->avl_right, + lockdep_is_held(&peers.lock)); lrh = node_height(lr); if (lrh <= node_height(ll)) { /* ll: RH+1 */ - node->avl_left = lr; /* lr: RH or RH+1 */ - node->avl_right = r; /* r: RH */ + RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = lrh + 1; /* RH+1 or RH+2 */ - l->avl_left = ll; /* ll: RH+1 */ - l->avl_right = node; /* node: RH+1 or RH+2 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ + RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ l->avl_height = node->avl_height + 1; - *nodep = l; + RCU_INIT_POINTER(*nodep, l); } else { /* ll: RH, lr: RH+1 */ - lrl = lr->avl_left; /* lrl: RH or RH-1 */ - lrr = lr->avl_right; /* lrr: RH or RH-1 */ - node->avl_left = lrr; /* lrr: RH or RH-1 */ - node->avl_right = r; /* r: RH */ + lrl = rcu_dereference_protected(lr->avl_left, + lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */ + lrr = rcu_dereference_protected(lr->avl_right, + lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = rh + 1; /* node: RH+1 */ - l->avl_left = ll; /* ll: RH */ - l->avl_right = lrl; /* lrl: RH or RH-1 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ + RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ l->avl_height = rh + 1; /* l: RH+1 */ - lr->avl_left = l; /* l: RH+1 */ - lr->avl_right = node; /* node: RH+1 */ + RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ + RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ lr->avl_height = rh + 2; - *nodep = lr; + RCU_INIT_POINTER(*nodep, lr); } } else if (rh > lh + 1) { /* r: LH+2 */ struct inet_peer *rr, *rl, *rlr, *rll; int rlh; - rr = r->avl_right; - rl = r->avl_left; + rr = rcu_dereference_protected(r->avl_right, + lockdep_is_held(&peers.lock)); + rl = rcu_dereference_protected(r->avl_left, + lockdep_is_held(&peers.lock)); rlh = node_height(rl); if (rlh <= node_height(rr)) { /* rr: LH+1 */ - node->avl_right = rl; /* rl: LH or LH+1 */ - node->avl_left = l; /* l: LH */ + RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = rlh + 1; /* LH+1 or LH+2 */ - r->avl_right = rr; /* rr: LH+1 */ - r->avl_left = node; /* node: LH+1 or LH+2 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ + RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ r->avl_height = node->avl_height + 1; - *nodep = r; + RCU_INIT_POINTER(*nodep, r); } else { /* rr: RH, rl: RH+1 */ - rlr = rl->avl_right; /* rlr: LH or LH-1 */ - rll = rl->avl_left; /* rll: LH or LH-1 */ - node->avl_right = rll; /* rll: LH or LH-1 */ - node->avl_left = l; /* l: LH */ + rlr = rcu_dereference_protected(rl->avl_right, + lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */ + rll = rcu_dereference_protected(rl->avl_left, + lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = lh + 1; /* node: LH+1 */ - r->avl_right = rr; /* rr: LH */ - r->avl_left = rlr; /* rlr: LH or LH-1 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ + RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ r->avl_height = lh + 1; /* r: LH+1 */ - rl->avl_right = r; /* r: LH+1 */ - rl->avl_left = node; /* node: LH+1 */ + RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ + RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ rl->avl_height = lh + 2; - *nodep = rl; + RCU_INIT_POINTER(*nodep, rl); } } else { node->avl_height = (lh > rh ? lh : rh) + 1; @@ -254,15 +320,21 @@ static void peer_avl_rebalance(struct inet_peer **stack[], } } -/* Called with local BH disabled and the pool write lock held. */ +/* Called with local BH disabled and the pool lock held. */ #define link_to_pool(n) \ do { \ n->avl_height = 1; \ - n->avl_left = peer_avl_empty; \ - n->avl_right = peer_avl_empty; \ - **--stackptr = n; \ + n->avl_left = peer_avl_empty_rcu; \ + n->avl_right = peer_avl_empty_rcu; \ + /* lockless readers can catch us now */ \ + rcu_assign_pointer(**--stackptr, n); \ peer_avl_rebalance(stack, stackptr); \ -} while(0) +} while (0) + +static void inetpeer_free_rcu(struct rcu_head *head) +{ + kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); +} /* May be called with local BH enabled. */ static void unlink_from_pool(struct inet_peer *p) @@ -271,31 +343,33 @@ static void unlink_from_pool(struct inet_peer *p) do_free = 0; - write_lock_bh(&peer_pool_lock); + spin_lock_bh(&peers.lock); /* Check the reference counter. It was artificially incremented by 1 - * in cleanup() function to prevent sudden disappearing. If the - * reference count is still 1 then the node is referenced only as `p' - * here and from the pool. So under the exclusive pool lock it's safe - * to remove the node and free it later. */ - if (atomic_read(&p->refcnt) == 1) { - struct inet_peer **stack[PEER_MAXDEPTH]; - struct inet_peer ***stackptr, ***delp; + * in cleanup() function to prevent sudden disappearing. If we can + * atomically (because of lockless readers) take this last reference, + * it's safe to remove the node and free it later. + * We use refcnt=-1 to alert lockless readers this entry is deleted. + */ + if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { + struct inet_peer __rcu **stack[PEER_MAXDEPTH]; + struct inet_peer __rcu ***stackptr, ***delp; if (lookup(p->v4daddr, stack) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty) { + if (p->avl_left == peer_avl_empty_rcu) { *delp[0] = p->avl_right; --stackptr; } else { /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p); - BUG_ON(*stackptr[-1] != t); + BUG_ON(rcu_dereference_protected(*stackptr[-1], + lockdep_is_held(&peers.lock)) != t); **--stackptr = t->avl_left; /* t is removed, t->v4daddr > x->v4daddr for any * x in p->avl_left subtree. * Put t in the old place of p. */ - *delp[0] = t; + RCU_INIT_POINTER(*delp[0], t); t->avl_left = p->avl_left; t->avl_right = p->avl_right; t->avl_height = p->avl_height; @@ -303,20 +377,21 @@ static void unlink_from_pool(struct inet_peer *p) delp[1] = &t->avl_left; /* was &p->avl_left */ } peer_avl_rebalance(stack, stackptr); - peer_total--; + peers.total--; do_free = 1; } - write_unlock_bh(&peer_pool_lock); + spin_unlock_bh(&peers.lock); if (do_free) - kmem_cache_free(peer_cachep, p); + call_rcu_bh(&p->rcu, inetpeer_free_rcu); else /* The node is used again. Decrease the reference counter * back. The loop "cleanup -> unlink_from_unused * -> unlink_from_pool -> putpeer -> link_to_unused * -> cleanup (for the same node)" * doesn't really exist because the entry will have a - * recent deletion time and will not be cleaned again soon. */ + * recent deletion time and will not be cleaned again soon. + */ inet_putpeer(p); } @@ -326,16 +401,16 @@ static int cleanup_once(unsigned long ttl) struct inet_peer *p = NULL; /* Remove the first entry from the list of unused nodes. */ - spin_lock_bh(&inet_peer_unused_lock); - if (!list_empty(&unused_peers)) { + spin_lock_bh(&unused_peers.lock); + if (!list_empty(&unused_peers.list)) { __u32 delta; - p = list_first_entry(&unused_peers, struct inet_peer, unused); + p = list_first_entry(&unused_peers.list, struct inet_peer, unused); delta = (__u32)jiffies - p->dtime; if (delta < ttl) { /* Do not prune fresh entries. */ - spin_unlock_bh(&inet_peer_unused_lock); + spin_unlock_bh(&unused_peers.lock); return -1; } @@ -345,7 +420,7 @@ static int cleanup_once(unsigned long ttl) * before unlink_from_pool() call. */ atomic_inc(&p->refcnt); } - spin_unlock_bh(&inet_peer_unused_lock); + spin_unlock_bh(&unused_peers.lock); if (p == NULL) /* It means that the total number of USED entries has @@ -360,62 +435,56 @@ static int cleanup_once(unsigned long ttl) /* Called with or without local BH being disabled. */ struct inet_peer *inet_getpeer(__be32 daddr, int create) { - struct inet_peer *p, *n; - struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; + struct inet_peer *p; + struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - /* Look up for the address quickly. */ - read_lock_bh(&peer_pool_lock); - p = lookup(daddr, NULL); - if (p != peer_avl_empty) - atomic_inc(&p->refcnt); - read_unlock_bh(&peer_pool_lock); + /* Look up for the address quickly, lockless. + * Because of a concurrent writer, we might not find an existing entry. + */ + rcu_read_lock_bh(); + p = lookup_rcu_bh(daddr); + rcu_read_unlock_bh(); + + if (p) { + /* The existing node has been found. + * Remove the entry from unused list if it was there. + */ + unlink_from_unused(p); + return p; + } + /* retry an exact lookup, taking the lock before. + * At least, nodes should be hot in our cache. + */ + spin_lock_bh(&peers.lock); + p = lookup(daddr, stack); if (p != peer_avl_empty) { - /* The existing node has been found. */ + atomic_inc(&p->refcnt); + spin_unlock_bh(&peers.lock); /* Remove the entry from unused list if it was there. */ unlink_from_unused(p); return p; } + p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; + if (p) { + p->v4daddr = daddr; + atomic_set(&p->refcnt, 1); + atomic_set(&p->rid, 0); + atomic_set(&p->ip_id_count, secure_ip_id(daddr)); + p->tcp_ts_stamp = 0; + INIT_LIST_HEAD(&p->unused); + + + /* Link the node. */ + link_to_pool(p); + peers.total++; + } + spin_unlock_bh(&peers.lock); - if (!create) - return NULL; - - /* Allocate the space outside the locked region. */ - n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); - if (n == NULL) - return NULL; - n->v4daddr = daddr; - atomic_set(&n->refcnt, 1); - atomic_set(&n->rid, 0); - atomic_set(&n->ip_id_count, secure_ip_id(daddr)); - n->tcp_ts_stamp = 0; - - write_lock_bh(&peer_pool_lock); - /* Check if an entry has suddenly appeared. */ - p = lookup(daddr, stack); - if (p != peer_avl_empty) - goto out_free; - - /* Link the node. */ - link_to_pool(n); - INIT_LIST_HEAD(&n->unused); - peer_total++; - write_unlock_bh(&peer_pool_lock); - - if (peer_total >= inet_peer_threshold) + if (peers.total >= inet_peer_threshold) /* Remove one less-recently-used entry. */ cleanup_once(0); - return n; - -out_free: - /* The appropriate node is already in the pool. */ - atomic_inc(&p->refcnt); - write_unlock_bh(&peer_pool_lock); - /* Remove the entry from unused list if it was there. */ - unlink_from_unused(p); - /* Free preallocated the preallocated node. */ - kmem_cache_free(peer_cachep, n); return p; } @@ -425,12 +494,12 @@ static void peer_check_expire(unsigned long dummy) unsigned long now = jiffies; int ttl; - if (peer_total >= inet_peer_threshold) + if (peers.total >= inet_peer_threshold) ttl = inet_peer_minttl; else ttl = inet_peer_maxttl - (inet_peer_maxttl - inet_peer_minttl) / HZ * - peer_total / inet_peer_threshold * HZ; + peers.total / inet_peer_threshold * HZ; while (!cleanup_once(ttl)) { if (jiffies != now) break; @@ -439,22 +508,25 @@ static void peer_check_expire(unsigned long dummy) /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime * interval depending on the total number of entries (more entries, * less interval). */ - if (peer_total >= inet_peer_threshold) + if (peers.total >= inet_peer_threshold) peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; else peer_periodic_timer.expires = jiffies + inet_peer_gc_maxtime - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * - peer_total / inet_peer_threshold * HZ; + peers.total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } void inet_putpeer(struct inet_peer *p) { - spin_lock_bh(&inet_peer_unused_lock); - if (atomic_dec_and_test(&p->refcnt)) { - list_add_tail(&p->unused, &unused_peers); + local_bh_disable(); + + if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) { + list_add_tail(&p->unused, &unused_peers.list); p->dtime = (__u32)jiffies; + spin_unlock(&unused_peers.lock); } - spin_unlock_bh(&inet_peer_unused_lock); + + local_bh_enable(); } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 56cdf68a074c..99461f09320f 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -87,16 +87,16 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; - if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && + if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { - IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(dst_mtu(&rt->u.dst))); + htonl(dst_mtu(&rt->dst))); goto drop; } /* We are about to mangle packet. Copy it! */ - if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len)) goto drop; iph = ip_hdr(skb); @@ -113,7 +113,7 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, - rt->u.dst.dev, ip_forward_finish); + rt->dst.dev, ip_forward_finish); sr_failed: /* diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 75347ea70ea0..168440834ade 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -116,19 +116,16 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a) struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); - return (qp->id == arg->iph->id && + return qp->id == arg->iph->id && qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user); + qp->user == arg->user; } /* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct netns_frags *nf, - struct sk_buff *skb, int *work) +static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb) { - if (work) - *work -= skb->truesize; atomic_sub(skb->truesize, &nf->mem); kfree_skb(skb); } @@ -309,7 +306,7 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(qp->q.net, fp, NULL); + frag_kfree_skb(qp->q.net, fp); fp = xp; } while (fp); @@ -317,6 +314,7 @@ static int ip_frag_reinit(struct ipq *qp) qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; + qp->q.fragments_tail = NULL; qp->iif = 0; return 0; @@ -389,6 +387,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * in the chain of fragments so far. We must know where to put * this fragment, right? */ + prev = qp->q.fragments_tail; + if (!prev || FRAG_CB(prev)->offset < offset) { + next = NULL; + goto found; + } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { if (FRAG_CB(next)->offset >= offset) @@ -396,6 +399,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) prev = next; } +found: /* We found where to put this one. Check for overlap with * preceding fragment, and, if needed, align things so that * any overlaps are eliminated. @@ -446,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(qp->q.net, free_it, NULL); + frag_kfree_skb(qp->q.net, free_it); } } @@ -454,6 +458,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) /* Insert this fragment in the chain of fragments. */ skb->next = next; + if (!next) + qp->q.fragments_tail = skb; if (prev) prev->next = skb; else @@ -507,6 +513,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_nomem; fp->next = head->next; + if (!fp->next) + qp->q.fragments_tail = fp; prev->next = fp; skb_morph(head, qp->q.fragments); @@ -534,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; @@ -556,7 +564,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &qp->q.net->mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -566,8 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &qp->q.net->mem); } + atomic_sub(head->truesize, &qp->q.net->mem); head->next = NULL; head->dev = dev; @@ -578,6 +585,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph->tot_len = htons(len); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; + qp->q.fragments_tail = NULL; return 0; out_nomem: @@ -624,6 +632,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) kfree_skb(skb); return -ENOMEM; } +EXPORT_SYMBOL(ip_defrag); #ifdef CONFIG_SYSCTL static int zero; @@ -777,5 +786,3 @@ void __init ipfrag_init(void) ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); } - -EXPORT_SYMBOL(ip_defrag); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 32618e11076d..70ff77f02eee 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -44,8 +44,9 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/gre.h> -#ifdef CONFIG_IPV6 +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> @@ -63,13 +64,13 @@ We cannot track such dead loops during route installation, it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), - and silently drop packet when it expires. It is the best + and silently drop packet when it expires. It is a good solution, but it supposes maintaing new variable in ALL skb, even if no tunneling is used. - Current solution: HARD_TX_LOCK lock breaks dead loops. - - + Current solution: xmit_recursion breaks dead loops. This is a percpu + counter, since when we enter the first ndo_xmit(), cpu migration is + forbidden. We force an exit if this counter reaches RECURSION_LIMIT 2. Networking dead loops would not kill routers, but would really kill network. IP hop limit plays role of "t->recursion" in this case, @@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev); static int ipgre_net_id __read_mostly; struct ipgre_net { - struct ip_tunnel *tunnels[4][HASH_SIZE]; + struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; struct net_device *fb_tunnel_dev; }; @@ -158,13 +159,40 @@ struct ipgre_net { #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipgre_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipgre_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, @@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, { struct net *net = dev_net(dev); int link = dev->ifindex; - unsigned h0 = HASH(remote); - unsigned h1 = HASH(key); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(key); struct ip_tunnel *t, *cand = NULL; struct ipgre_net *ign = net_generic(net, ipgre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB)) ? @@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, return NULL; } -static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, +static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; - unsigned h = HASH(key); + unsigned int h = HASH(key); int prio = 0; if (local) @@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, return &ign->tunnels[prio][h]; } -static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, +static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, struct ip_tunnel *t) { return __ipgre_bucket(ign, &t->parms); @@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipgre_bucket(ign, t); + struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); - spin_lock_bh(&ipgre_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipgre_lock); } static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipgre_lock); - *tp = t->next; - spin_unlock_bh(&ipgre_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipgre_bucket(ign, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, __be32 local = parms->iph.saddr; __be32 key = parms->i_key; int link = parms->link; - struct ip_tunnel *t, **tp; + struct ip_tunnel *t; + struct ip_tunnel __rcu **tp; struct ipgre_net *ign = net_generic(net, ipgre_net_id); - for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) + for (tp = __ipgre_bucket(ign, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && key == t->parms.i_key && @@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, return t; } -static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, +static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { struct ip_tunnel *t, *nt; @@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb) if ((tunnel = ipgre_tunnel_lookup(skb->dev, iph->saddr, iph->daddr, key, gre_proto))) { - struct net_device_stats *stats = &tunnel->dev->stats; + struct pcpu_tstats *tstats; secpath_reset(skb); @@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb) /* Looped back packet, drop it! */ if (skb_rtable(skb)->fl.iif == 0) goto drop; - stats->multicast++; + tunnel->dev->stats.multicast++; skb->pkt_type = PACKET_BROADCAST; } #endif if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - stats->rx_crc_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } if (tunnel->parms.i_flags&GRE_SEQ) { if (!(flags&GRE_SEQ) || (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - stats->rx_fifo_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } tunnel->i_seqno = seqno + 1; @@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb) /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { - stats->rx_length_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } @@ -640,14 +670,19 @@ static int ipgre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); netif_rx(skb); + rcu_read_unlock(); - return(0); + return 0; } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -655,20 +690,19 @@ drop: rcu_read_unlock(); drop_nolock: kfree_skb(skb); - return(0); + return 0; } static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; __be16 df; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int gre_hlen; @@ -690,7 +724,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev /* NBMA tunnel */ if (skb_dst(skb) == NULL) { - stats->tx_fifo_errors++; + dev->stats.tx_fifo_errors++; goto tx_error; } @@ -699,7 +733,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if ((dst = rt->rt_gateway) == 0) goto tx_error_icmp; } -#ifdef CONFIG_IPV6 +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; int addr_type; @@ -731,31 +765,39 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tos = 0; if (skb->protocol == htons(ETH_P_IP)) tos = old_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); } { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_GRE + } +; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error; } } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } df = tiph->frag_off; if (df) - mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; + mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen; else mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; @@ -772,7 +814,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev goto tx_error; } } -#ifdef CONFIG_IPV6 +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); @@ -803,7 +845,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tunnel->err_count = 0; } - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { @@ -812,7 +854,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -830,7 +872,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -848,12 +890,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if ((iph->ttl = tiph->ttl) == 0) { if (skb->protocol == htons(ETH_P_IP)) iph->ttl = old_iph->ttl; -#ifdef CONFIG_IPV6 +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (skb->protocol == htons(ETH_P_IPV6)) iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; #endif else - iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); + iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); } ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; @@ -879,15 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -907,15 +949,21 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_GRE + }; struct rtable *rt; + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; ip_rt_put(rt); } @@ -1010,7 +1058,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } } else { - unsigned nflags = 0; + unsigned int nflags = 0; t = netdev_priv(dev); @@ -1024,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } ipgre_tunnel_unlink(ign, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; @@ -1123,7 +1172,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - const void *daddr, const void *saddr, unsigned len) + const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); @@ -1165,16 +1214,22 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi fl = { .oif = t->parms.link, - .nl_u = { .ip4_u = - { .daddr = t->parms.iph.daddr, - .saddr = t->parms.iph.saddr, - .tos = RT_TOS(t->parms.iph.tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = t->parms.link, + .nl_u = { + .ip4_u = { + .daddr = t->parms.iph.daddr, + .saddr = t->parms.iph.saddr, + .tos = RT_TOS(t->parms.iph.tos) + } + }, + .proto = IPPROTO_GRE + }; struct rtable *rt; + if (ip_route_output_key(dev_net(dev), &rt, &fl)) return -EADDRNOTAVAIL; - dev = rt->u.dst.dev; + dev = rt->dst.dev; ip_rt_put(rt); if (__in_dev_get_rtnl(dev) == NULL) return -EADDRNOTAVAIL; @@ -1191,10 +1246,8 @@ static int ipgre_close(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; in_dev = inetdev_by_index(dev_net(dev), t->mlink); - if (in_dev) { + if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); - in_dev_put(in_dev); - } } return 0; } @@ -1211,12 +1264,19 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_start_xmit = ipgre_tunnel_xmit, .ndo_do_ioctl = ipgre_tunnel_ioctl, .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, }; +static void ipgre_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipgre_dev_free; dev->type = ARPHRD_IPGRE; dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; @@ -1254,6 +1314,10 @@ static int ipgre_tunnel_init(struct net_device *dev) } else dev->header_ops = &ipgre_header_ops; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } @@ -1261,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -1272,14 +1335,12 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct iphdr) + 4; dev_hold(dev); - ign->tunnels_wc[0] = tunnel; } -static const struct net_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .netns_ok = 1, +static const struct gre_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, }; static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) @@ -1289,11 +1350,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = ign->tunnels[prio][h]; + struct ip_tunnel *t; + + t = rtnl_dereference(ign->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } @@ -1318,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net) if ((err = register_netdev(ign->fb_tunnel_dev))) goto err_reg_dev; + rcu_assign_pointer(ign->tunnels_wc[0], + netdev_priv(ign->fb_tunnel_dev)); return 0; err_reg_dev: - free_netdev(ign->fb_tunnel_dev); + ipgre_dev_free(ign->fb_tunnel_dev); err_alloc_dev: return err; } @@ -1439,6 +1504,10 @@ static int ipgre_tap_init(struct net_device *dev) ipgre_tunnel_bind_dev(dev); + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } @@ -1449,6 +1518,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, }; static void ipgre_tap_setup(struct net_device *dev) @@ -1457,7 +1527,7 @@ static void ipgre_tap_setup(struct net_device *dev) ether_setup(dev); dev->netdev_ops = &ipgre_tap_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipgre_dev_free; dev->iflink = 0; dev->features |= NETIF_F_NETNS_LOCAL; @@ -1485,6 +1555,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla if (!tb[IFLA_MTU]) dev->mtu = mtu; + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + err = register_netdevice(dev); if (err) goto out; @@ -1520,7 +1594,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], t = nt; if (dev->type != ARPHRD_ETHER) { - unsigned nflags = 0; + unsigned int nflags = 0; if (ipv4_is_multicast(p.iph.daddr)) nflags = IFF_BROADCAST; @@ -1661,7 +1735,7 @@ static int __init ipgre_init(void) if (err < 0) return err; - err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { printk(KERN_INFO "ipgre init: can't add protocol\n"); goto add_proto_failed; @@ -1681,7 +1755,7 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_net_ops); goto out; @@ -1691,7 +1765,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) + if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); unregister_pernet_device(&ipgre_net_ops); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d930dc5e4d85..d859bcc26cb7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -146,7 +146,7 @@ #include <linux/netlink.h> /* - * Process Router Attention IP option + * Process Router Attention IP option (RFC 2113) */ int ip_call_ra_chain(struct sk_buff *skb) { @@ -155,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb) struct sock *last = NULL; struct net_device *dev = skb->dev; - read_lock(&ip_ra_lock); - for (ra = ip_ra_chain; ra; ra = ra->next) { + for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -167,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb) sk->sk_bound_dev_if == dev->ifindex) && net_eq(sock_net(sk), dev_net(dev))) { if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { - read_unlock(&ip_ra_lock); + if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) return 1; - } } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -183,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb) if (last) { raw_rcv(last, skb); - read_unlock(&ip_ra_lock); return 1; } - read_unlock(&ip_ra_lock); return 0; } @@ -298,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb) } if (unlikely(opt->srr)) { - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); + if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_INFO "source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); - in_dev_put(in_dev); goto drop; } - - in_dev_put(in_dev); } if (ip_options_rcv_srr(skb)) @@ -340,13 +333,16 @@ static int ip_rcv_finish(struct sk_buff *skb) else if (err == -ENETUNREACH) IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INNOROUTES); + else if (err == -EXDEV) + NET_INC_STATS_BH(dev_net(skb->dev), + LINUX_MIB_IPRPFILTER); goto drop; } } #ifdef CONFIG_NET_CLS_ROUTE if (unlikely(skb_dst(skb)->tclassid)) { - struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); + struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; @@ -360,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, + IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, skb->len); return dst_input(skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ba9836c488ed..1906fa35860c 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -466,7 +466,7 @@ error: } return -EINVAL; } - +EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). @@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb) } return 0; } +EXPORT_SYMBOL(ip_options_rcv_srr); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 041d41df1224..439d2a34ee44 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -89,6 +89,7 @@ __inline__ void ip_send_check(struct iphdr *iph) iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +EXPORT_SYMBOL(ip_send_check); int __ip_local_out(struct sk_buff *skb) { @@ -151,15 +152,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->version = 4; iph->ihl = 5; iph->tos = inet->tos; - if (ip_dont_fragment(sk, &rt->u.dst)) + if (ip_dont_fragment(sk, &rt->dst)) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->sk_protocol; - ip_select_ident(iph, &rt->u.dst, sk); + ip_select_ident(iph, &rt->dst, sk); if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; @@ -172,7 +173,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, /* Send it out. */ return ip_local_out(skb); } - EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) @@ -240,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb) { struct sock *sk = skb->sk; struct rtable *rt = skb_rtable(skb); - struct net_device *dev = rt->u.dst.dev; + struct net_device *dev = rt->dst.dev; /* * If the indicated interface is up and running, send the packet. @@ -359,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb) if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) goto no_route; } - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); } - skb_dst_set_noref(skb, &rt->u.dst); + skb_dst_set_noref(skb, &rt->dst); packet_routed: if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -372,11 +372,11 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df) + if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; - iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; @@ -387,7 +387,7 @@ packet_routed: ip_options_build(skb, opt, inet->inet_daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, + ip_select_ident_more(iph, &rt->dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); skb->priority = sk->sk_priority; @@ -403,6 +403,7 @@ no_route: kfree_skb(skb); return -EHOSTUNREACH; } +EXPORT_SYMBOL(ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) @@ -411,7 +412,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->priority = from->priority; to->protocol = from->protocol; skb_dst_drop(to); - skb_dst_set(to, dst_clone(skb_dst(from))); + skb_dst_copy(to, from); to->dev = from->dev; to->mark = from->mark; @@ -442,17 +443,16 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct iphdr *iph; - int raw = 0; int ptr; struct net_device *dev; struct sk_buff *skb2; - unsigned int mtu, hlen, left, len, ll_rs, pad; + unsigned int mtu, hlen, left, len, ll_rs; int offset; __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); int err = 0; - dev = rt->u.dst.dev; + dev = rt->dst.dev; /* * Point into the IP datagram header. @@ -473,7 +473,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) */ hlen = iph->ihl * 4; - mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */ #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge) mtu -= nf_bridge_mtu_reduction(skb); @@ -487,10 +487,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ - if (skb_has_frags(skb)) { - struct sk_buff *frag; + if (skb_has_frag_list(skb)) { + struct sk_buff *frag, *frag2; int first_len = skb_pagelen(skb); - int truesizes = 0; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || @@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen) - goto slow_path; + goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) - goto slow_path; + goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } - truesizes += frag->truesize; + skb->truesize -= frag->truesize; } /* Everything is OK. Generate! */ @@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); skb->data_len = first_len - skb_headlen(skb); - skb->truesize -= truesizes; skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); @@ -576,18 +574,25 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } } slow_path: left = skb->len - hlen; /* Space per frame */ - ptr = raw + hlen; /* Where to start from */ + ptr = hlen; /* Where to start from */ /* for bridged IP traffic encapsulated inside f.e. a vlan header, * we need to make room for the encapsulating header */ - pad = nf_bridge_pad(skb); - ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad); - mtu -= pad; + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); /* * Fragment the datagram. @@ -697,7 +702,6 @@ fail: IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } - EXPORT_SYMBOL(ip_fragment); int @@ -716,6 +720,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk } return 0; } +EXPORT_SYMBOL(ip_generic_getfrag); static inline __wsum csum_page(struct page *page, int offset, int copy) @@ -833,16 +838,15 @@ int ip_append_data(struct sock *sk, */ *rtp = NULL; inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->u.dst.dev->mtu : - dst_mtu(rt->u.dst.path); - inet->cork.dst = &rt->u.dst; + rt->dst.dev->mtu : + dst_mtu(rt->dst.path); + inet->cork.dst = &rt->dst; inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->u.dst.header_len) != 0) { - length += exthdrlen; - transhdrlen += exthdrlen; - } + exthdrlen = rt->dst.header_len; + length += exthdrlen; + transhdrlen += exthdrlen; } else { rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) @@ -852,7 +856,7 @@ int ip_append_data(struct sock *sk, exthdrlen = 0; mtu = inet->cork.fragsize; } - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; @@ -869,7 +873,7 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features & NETIF_F_V4_CSUM && + rt->dst.dev->features & NETIF_F_V4_CSUM && !exthdrlen) csummode = CHECKSUM_PARTIAL; @@ -878,7 +882,7 @@ int ip_append_data(struct sock *sk, inet->cork.length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO)) { err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags); @@ -926,19 +930,22 @@ alloc_new_skb: fraglen = datalen + fragheaderlen; if ((flags & MSG_MORE) && - !(rt->u.dst.dev->features&NETIF_F_SG)) + !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else - alloclen = datalen + fragheaderlen; + alloclen = fraglen; /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ - if (datalen == length + fraggap) - alloclen += rt->u.dst.trailer_len; - + if (datalen == length + fraggap) { + alloclen += rt->dst.trailer_len; + /* make sure mtu is not reached */ + if (datalen > mtu - fragheaderlen - rt->dst.trailer_len) + datalen -= ALIGN(rt->dst.trailer_len, 8); + } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, @@ -955,7 +962,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->shtx.flags = 0; + ipc->tx_flags = 0; } if (skb == NULL) goto error; @@ -966,7 +973,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - *skb_tx(skb) = ipc->shtx; + skb_shinfo(skb)->tx_flags = ipc->tx_flags; /* * Find where to start putting bytes. @@ -1010,7 +1017,7 @@ alloc_new_skb: if (copy > length) copy = length; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; @@ -1105,10 +1112,10 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, if (inet->cork.flags & IPCORK_OPT) opt = inet->cork.opt; - if (!(rt->u.dst.dev->features&NETIF_F_SG)) + if (!(rt->dst.dev->features&NETIF_F_SG)) return -EOPNOTSUPP; - hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + hh_len = LL_RESERVED_SPACE(rt->dst.dev); mtu = inet->cork.fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); @@ -1125,7 +1132,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) { + (rt->dst.dev->features & NETIF_F_UFO)) { skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } @@ -1277,8 +1284,8 @@ int ip_push_pending_frames(struct sock *sk) * If local_df is set too, we still allow to fragment this frame * locally. */ if (inet->pmtudisc >= IP_PMTUDISC_DO || - (skb->len <= dst_mtu(&rt->u.dst) && - ip_dont_fragment(sk, &rt->u.dst))) + (skb->len <= dst_mtu(&rt->dst) && + ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); if (inet->cork.flags & IPCORK_OPT) @@ -1287,7 +1294,7 @@ int ip_push_pending_frames(struct sock *sk) if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else - ttl = ip_select_ttl(inet, &rt->u.dst); + ttl = ip_select_ttl(inet, &rt->dst); iph = (struct iphdr *)skb->data; iph->version = 4; @@ -1298,7 +1305,7 @@ int ip_push_pending_frames(struct sock *sk) } iph->tos = inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->u.dst, sk); + ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; @@ -1311,7 +1318,7 @@ int ip_push_pending_frames(struct sock *sk) * on dst refcount */ inet->cork.dst = NULL; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) @@ -1386,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; @@ -1448,7 +1455,3 @@ void __init ip_init(void) igmp_mc_proc_init(); #endif } - -EXPORT_SYMBOL(ip_generic_getfrag); -EXPORT_SYMBOL(ip_queue_xmit); -EXPORT_SYMBOL(ip_send_check); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ce231780a2b1..3948c86e59ca 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -238,48 +238,68 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ -struct ip_ra_chain *ip_ra_chain; -DEFINE_RWLOCK(ip_ra_lock); +struct ip_ra_chain __rcu *ip_ra_chain; +static DEFINE_SPINLOCK(ip_ra_lock); + + +static void ip_ra_destroy_rcu(struct rcu_head *head) +{ + struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); + + sock_put(ra->saved_sk); + kfree(ra); +} int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) { - struct ip_ra_chain *ra, *new_ra, **rap; + struct ip_ra_chain *ra, *new_ra; + struct ip_ra_chain __rcu **rap; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - write_lock_bh(&ip_ra_lock); - for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { + spin_lock_bh(&ip_ra_lock); + for (rap = &ip_ra_chain; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&ip_ra_lock))) != NULL; + rap = &ra->next) { if (ra->sk == sk) { if (on) { - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); kfree(new_ra); return -EADDRINUSE; } - *rap = ra->next; - write_unlock_bh(&ip_ra_lock); + /* dont let ip_call_ra_chain() use sk again */ + ra->sk = NULL; + rcu_assign_pointer(*rap, ra->next); + spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); - sock_put(sk); - kfree(ra); + /* + * Delay sock_put(sk) and kfree(ra) after one rcu grace + * period. This guarantee ip_call_ra_chain() dont need + * to mess with socket refcounts. + */ + ra->saved_sk = sk; + call_rcu(&ra->rcu, ip_ra_destroy_rcu); return 0; } } if (new_ra == NULL) { - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->destructor = destructor; new_ra->next = ra; - *rap = new_ra; + rcu_assign_pointer(*rap, new_ra); sock_hold(sk); - write_unlock_bh(&ip_ra_lock); + spin_unlock_bh(&ip_ra_lock); return 0; } @@ -449,7 +469,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | - (1<<IP_MINTTL))) || + (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) || optname == IP_MULTICAST_TTL || optname == IP_MULTICAST_ALL || optname == IP_MULTICAST_LOOP || @@ -572,6 +592,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, } inet->hdrincl = val ? 1 : 0; break; + case IP_NODEFRAG: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; + break; + } + inet->nodefrag = val ? 1 : 0; + break; case IP_MTU_DISCOVER: if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) goto e_inval; @@ -1106,6 +1133,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_HDRINCL: val = inet->hdrincl; break; + case IP_NODEFRAG: + val = inet->nodefrag; + break; case IP_MTU_DISCOVER: val = inet->pmtudisc; break; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b9d84e800cf4..3a6e1ec5e9ae 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -665,6 +665,13 @@ ic_dhcp_init_options(u8 *options) memcpy(e, ic_req_params, sizeof(ic_req_params)); e += sizeof(ic_req_params); + if (ic_host_name_set) { + *e++ = 12; /* host-name */ + len = strlen(utsname()->nodename); + *e++ = len; + memcpy(e, utsname()->nodename, len); + e += len; + } if (*vendor_class_identifier) { printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", vendor_class_identifier); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 7fd636711037..cd300aaee78f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -122,31 +122,59 @@ static int ipip_net_id __read_mostly; struct ipip_net { - struct ip_tunnel *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel *tunnels_r[HASH_SIZE]; - struct ip_tunnel *tunnels_l[HASH_SIZE]; - struct ip_tunnel *tunnels_wc[1]; - struct ip_tunnel **tunnels[4]; + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; -static void ipip_tunnel_init(struct net_device *dev); +static int ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); +static void ipip_dev_free(struct net_device *dev); /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipip_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipip_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); @@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, +static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - unsigned h = 0; + unsigned int h = 0; int prio = 0; if (remote) { @@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, return &ipn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, +static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, struct ip_tunnel *t) { return __ipip_bucket(ipn, &t->parms); @@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipip_lock); - *tp = t->next; - spin_unlock_bh(&ipip_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip_bucket(ipn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip_bucket(ipn, t); + struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - spin_lock_bh(&ipip_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipip_lock); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, @@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip_bucket(ipn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else - sprintf(name, "tunl%%d"); + strcpy(name, "tunl%d"); dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); if (dev == NULL) @@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, nt = netdev_priv(dev); nt->parms = *parms; - ipip_tunnel_init(dev); + if (ipip_tunnel_init(dev) < 0) + goto failed_free; if (register_netdevice(dev) < 0) goto failed_free; @@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, return nt; failed_free: - free_netdev(dev); + ipip_dev_free(dev); return NULL; } +/* called with RTNL */ static void ipip_tunnel_uninit(struct net_device *dev) { struct net *net = dev_net(dev); struct ipip_net *ipn = net_generic(net, ipip_net_id); - if (dev == ipn->fb_tunnel_dev) { - spin_lock_bh(&ipip_lock); - ipn->tunnels_wc[0] = NULL; - spin_unlock_bh(&ipip_lock); - } else + if (dev == ipn->fb_tunnel_dev) + rcu_assign_pointer(ipn->tunnels_wc[0], NULL); + else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); } @@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); rcu_read_lock(); - if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), - iph->saddr, iph->daddr)) != NULL) { + tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); kfree_skb(skb); @@ -374,10 +406,16 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); ipip_ecn_decapsulate(iph, skb); + netif_rx(skb); + rcu_read_unlock(); return 0; } @@ -394,13 +432,12 @@ static int ipip_rcv(struct sk_buff *skb) static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ @@ -410,13 +447,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->protocol != htons(ETH_P_IP)) goto tx_error; - if (tos&1) + if (tos & 1) tos = old_iph->tos; if (!dst) { /* NBMA tunnel */ if ((rt = skb_rtable(skb)) == NULL) { - stats->tx_fifo_errors++; + dev->stats.tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) @@ -424,32 +461,38 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } } - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } df |= old_iph->frag_off & htons(IP_DF); if (df) { - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - stats->collisions++; + dev->stats.collisions++; ip_rt_put(rt); goto tx_error; } @@ -485,7 +528,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -503,7 +546,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. @@ -522,14 +565,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ttl = old_iph->ttl; nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -544,15 +587,21 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; struct rtable *rt; + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { - tdev = rt->u.dst.dev; + tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; @@ -627,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip_tunnel_unlink(ipn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); @@ -696,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ipip_tunnel_change_mtu, - + .ndo_get_stats = ipip_get_stats, }; +static void ipip_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipip_dev_free; dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); @@ -711,10 +767,11 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } -static void ipip_tunnel_init(struct net_device *dev) +static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); @@ -725,9 +782,15 @@ static void ipip_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip_tunnel_bind_dev(dev); + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; } -static void __net_init ipip_fb_tunnel_init(struct net_device *dev) +static int __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -740,11 +803,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + dev_hold(dev); - ipn->tunnels_wc[0] = tunnel; + rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + return 0; } -static struct xfrm_tunnel ipip_handler = { +static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, @@ -760,11 +828,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = ipn->tunnels[prio][h]; + struct ip_tunnel *t; + t = rtnl_dereference(ipn->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } @@ -789,7 +858,9 @@ static int __net_init ipip_init_net(struct net *net) } dev_net_set(ipn->fb_tunnel_dev, net); - ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + if (err) + goto err_reg_dev; if ((err = register_netdev(ipn->fb_tunnel_dev))) goto err_reg_dev; @@ -797,7 +868,7 @@ static int __net_init ipip_init_net(struct net *net) return 0; err_reg_dev: - free_netdev(ipn->fb_tunnel_dev); + ipip_dev_free(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ return err; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7f6273506eea..86dd5691af46 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -75,7 +75,7 @@ struct mr_table { struct net *net; #endif u32 id; - struct sock *mroute_sk; + struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct list_head mfc_cache_array[MFC_LINES]; @@ -98,7 +98,7 @@ struct ipmr_result { }; /* Big lock, protecting vif table, mrt cache and mroute socket state. - Note that the changes are semaphored via rtnl_lock. + * Note that the changes are semaphored via rtnl_lock. */ static DEFINE_RWLOCK(mrt_lock); @@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock); static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved - entries is changed only in process context and protected - with weak lock mrt_lock. Queue of unresolved entries is protected - with strong spinlock mfc_unres_lock. - - In this case data path is free of exclusive locks at all. + * entries is changed only in process context and protected + * with weak lock mrt_lock. Queue of unresolved entries is protected + * with strong spinlock mfc_unres_lock. + * + * In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; @@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) set_fs(KERNEL_DS); err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); set_fs(oldfs); - } else + } else { err = -EOPNOTSUPP; - + } dev = NULL; if (err == 0 && @@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) dev->iflink = 0; rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { rcu_read_unlock(); goto failure; } @@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, mrt->mroute_reg_vif_num = -1; #endif - if (vifi+1 == mrt->maxvif) { + if (vifi + 1 == mrt->maxvif) { int tmp; - for (tmp=vifi-1; tmp>=0; tmp--) { + + for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } @@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, dev_set_allmulti(dev, -1); - if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; ip_rt_multicast_event(in_dev); } - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); dev_put(dev); return 0; } -static inline void ipmr_cache_free(struct mfc_cache *c) +static void ipmr_cache_free_rcu(struct rcu_head *head) { + struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); + kmem_cache_free(mrt_cachep, c); } +static inline void ipmr_cache_free(struct mfc_cache *c) +{ + call_rcu(&c->rcu, ipmr_cache_free_rcu); +} + /* Destroy an unresolved cache entry, killing queued skbs - and reporting error to netlink readers. + * and reporting error to netlink readers. */ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) @@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); rtnl_unicast(skb, net, NETLINK_CB(skb).pid); - } else + } else { kfree_skb(skb); + } } ipmr_cache_free(c); @@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); - if (dev && dev->ip_ptr == NULL) { + if (dev && __in_dev_get_rtnl(dev) == NULL) { dev_put(dev); return -EADDRNOTAVAIL; } - } else + } else { dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); - + } if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); @@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EINVAL; } - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { dev_put(dev); return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; ip_rt_multicast_event(in_dev); - /* - * Fill in the VIF structures - */ + /* Fill in the VIF structures */ + v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->pkt_in = 0; v->pkt_out = 0; v->link = dev->ifindex; - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) v->link = dev->iflink; /* And finish update writing critical data */ write_lock_bh(&mrt_lock); v->dev = dev; #ifdef CONFIG_IP_PIMSM - if (v->flags&VIFF_REGISTER) + if (v->flags & VIFF_REGISTER) mrt->mroute_reg_vif_num = vifi; #endif if (vifi+1 > mrt->maxvif) @@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, return 0; } +/* called with rcu_read_lock() */ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) @@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, int line = MFC_HASH(mcastgrp, origin); struct mfc_cache *c; - list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) return c; } @@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); - if (c == NULL) - return NULL; - c->mfc_un.res.minvif = MAXVIFS; + + if (c) + c->mfc_un.res.minvif = MAXVIFS; return c; } static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); - if (c == NULL) - return NULL; - skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10*HZ; + + if (c) { + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10*HZ; + } return c; } @@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct sk_buff *skb; struct nlmsgerr *e; - /* - * Play the pending entries through our router - */ + /* Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { - nlh->nlmsg_len = (skb_tail_pointer(skb) - - (u8 *)nlh); + nlh->nlmsg_len = skb_tail_pointer(skb) - + (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); @@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).pid); - } else + } else { ip_mr_forward(net, mrt, skb, c, 0); + } } } @@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt, const int ihl = ip_hdrlen(pkt); struct igmphdr *igmp; struct igmpmsg *msg; + struct sock *mroute_sk; int ret; #ifdef CONFIG_IP_PIMSM @@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt, #ifdef CONFIG_IP_PIMSM if (assert == IGMPMSG_WHOLEPKT) { /* Ugly, but we have no choice with this interface. - Duplicate old header, fix ihl, length etc. - And all this only to mangle msg->im_msgtype and - to set msg->im_mbz to "mbz" :-) + * Duplicate old header, fix ihl, length etc. + * And all this only to mangle msg->im_msgtype and + * to set msg->im_mbz to "mbz" :-) */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt, #endif { - /* - * Copy the IP header - */ + /* Copy the IP header */ skb->network_header = skb->tail; skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); - ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ + ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; skb_dst_set(skb, dst_clone(skb_dst(pkt))); - /* - * Add our header - */ + /* Add our header */ - igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); igmp->type = msg->im_msgtype = assert; - igmp->code = 0; - ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ + igmp->code = 0; + ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } - if (mrt->mroute_sk == NULL) { + rcu_read_lock(); + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk == NULL) { + rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } - /* - * Deliver to mrouted - */ - ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); + /* Deliver to mrouted */ + + ret = sock_queue_rcv_skb(mroute_sk, skb); + rcu_read_unlock(); if (ret < 0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); @@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) } if (!found) { - /* - * Create a new entry if allowable - */ + /* Create a new entry if allowable */ if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || (c = ipmr_cache_alloc_unres()) == NULL) { @@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) return -ENOBUFS; } - /* - * Fill in the new cache entry - */ + /* Fill in the new cache entry */ + c->mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; - /* - * Reflect first query at mrouted. - */ + /* Reflect first query at mrouted. */ + err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry @@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); } - /* - * See if we can append the packet - */ - if (c->mfc_un.unres.unresolved.qlen>3) { + /* See if we can append the packet */ + + if (c->mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { @@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); + list_del_rcu(&c->list); ipmr_cache_free(c); return 0; @@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; - write_lock_bh(&mrt_lock); - list_add(&c->list, &mrt->mfc_cache_array[line]); - write_unlock_bh(&mrt_lock); + list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); /* * Check to see if we resolved a queued list. If so we @@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt) LIST_HEAD(list); struct mfc_cache *c, *next; - /* - * Shut down all active vif entries - */ + /* Shut down all active vif entries */ + for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif_table[i].flags&VIFF_STATIC)) + if (!(mrt->vif_table[i].flags & VIFF_STATIC)) vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); - /* - * Wipe the cache - */ + /* Wipe the cache */ + for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (c->mfc_flags&MFC_STATIC) + if (c->mfc_flags & MFC_STATIC) continue; - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); - + list_del_rcu(&c->list); ipmr_cache_free(c); } } @@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt) } } +/* called from ip_ra_control(), before an RCU grace period, + * we dont need to call synchronize_rcu() here + */ static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); @@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk) rtnl_lock(); ipmr_for_each_table(mrt, net) { - if (sk == mrt->mroute_sk) { + if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - - write_lock_bh(&mrt_lock); - mrt->mroute_sk = NULL; - write_unlock_bh(&mrt_lock); - + rcu_assign_pointer(mrt->mroute_sk, NULL); mroute_clean_tables(mrt); } } @@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENOENT; if (optname != MRT_INIT) { - if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) + if (sk != rcu_dereference_raw(mrt->mroute_sk) && + !capable(CAP_NET_ADMIN)) return -EACCES; } @@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENOPROTOOPT; rtnl_lock(); - if (mrt->mroute_sk) { + if (rtnl_dereference(mrt->mroute_sk)) { rtnl_unlock(); return -EADDRINUSE; } ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { - write_lock_bh(&mrt_lock); - mrt->mroute_sk = sk; - write_unlock_bh(&mrt_lock); - + rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; } rtnl_unlock(); return ret; case MRT_DONE: - if (sk != mrt->mroute_sk) + if (sk != rcu_dereference_raw(mrt->mroute_sk)) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENFILE; rtnl_lock(); if (optname == MRT_ADD_VIF) { - ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); + ret = vif_add(net, mrt, &vif, + sk == rtnl_dereference(mrt->mroute_sk)); } else { ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } @@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi if (optname == MRT_DEL_MFC) ret = ipmr_mfc_delete(mrt, &mfc); else - ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); + ret = ipmr_mfc_add(net, mrt, &mfc, + sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; /* @@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi case MRT_ASSERT: { int v; - if (get_user(v,(int __user *)optval)) + if (get_user(v, (int __user *)optval)) return -EFAULT; mrt->mroute_do_assert = (v) ? 1 : 0; return 0; @@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi { int v; - if (get_user(v,(int __user *)optval)) + if (get_user(v, (int __user *)optval)) return -EFAULT; v = (v) ? 1 : 0; @@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -EINVAL; if (get_user(v, (u32 __user *)optval)) return -EFAULT; - if (sk == mrt->mroute_sk) - return -EBUSY; rtnl_lock(); ret = 0; - if (!ipmr_new_table(net, v)) - ret = -ENOMEM; - raw_sk(sk)->ipmr_table = v; + if (sk == rtnl_dereference(mrt->mroute_sk)) { + ret = -EBUSY; + } else { + if (!ipmr_new_table(net, v)) + ret = -ENOMEM; + raw_sk(sk)->ipmr_table = v; + } rtnl_unlock(); return ret; } @@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int if (optname != MRT_VERSION && #ifdef CONFIG_IP_PIMSM - optname!=MRT_PIM && + optname != MRT_PIM && #endif - optname!=MRT_ASSERT) + optname != MRT_ASSERT) return -ENOPROTOOPT; if (get_user(olr, optlen)) @@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; - read_lock(&mrt_lock); + rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); + rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; @@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = { }; /* - * Encapsulate a packet by attaching a valid IPIP header to it. + * Encapsulate a packet by attaching a valid IPIP header to it. * This avoids tunnel drivers and other mess and gives us the speed so * important for multicast video. */ @@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) skb_reset_network_header(skb); iph = ip_hdr(skb); - iph->version = 4; + iph->version = 4; iph->tos = old_iph->tos; iph->ttl = old_iph->ttl; iph->frag_off = 0; @@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) static inline int ipmr_forward_finish(struct sk_buff *skb) { - struct ip_options * opt = &(IPCB(skb)->opt); + struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); @@ -1535,32 +1534,44 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, } #endif - if (vif->flags&VIFF_TUNNEL) { - struct flowi fl = { .oif = vif->link, - .nl_u = { .ip4_u = - { .daddr = vif->remote, - .saddr = vif->local, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + if (vif->flags & VIFF_TUNNEL) { + struct flowi fl = { + .oif = vif->link, + .nl_u = { + .ip4_u = { + .daddr = vif->remote, + .saddr = vif->local, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(net, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { - struct flowi fl = { .oif = vif->link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = vif->link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(net, &rt, &fl)) goto out_free; } - dev = rt->u.dst.dev; + dev = rt->dst.dev; - if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not - allow to send ICMP, so that packets will disappear - to blackhole. + * allow to send ICMP, so that packets will disappear + * to blackhole. */ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); @@ -1568,7 +1579,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } - encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; + encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); @@ -1579,11 +1590,12 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->bytes_out += skb->len; skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. - * What do we do with netfilter? -- RR */ + * What do we do with netfilter? -- RR + */ if (vif->flags & VIFF_TUNNEL) { ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ @@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (skb_rtable(skb)->fl.iif == 0) { /* It is our own packet, looped back. - Very complicated situation... - - The best workaround until routing daemons will be - fixed is not to redistribute packet, if it was - send through wrong interface. It means, that - multicast applications WILL NOT work for - (S,G), which have default multicast route pointing - to wrong oif. In any case, it is not a good - idea to use multicasting applications on router. + * Very complicated situation... + * + * The best workaround until routing daemons will be + * fixed is not to redistribute packet, if it was + * send through wrong interface. It means, that + * multicast applications WILL NOT work for + * (S,G), which have default multicast route pointing + * to wrong oif. In any case, it is not a good + * idea to use multicasting applications on router. */ goto dont_forward; } @@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, - so that we cannot check that packet arrived on an oif. - It is bad, but otherwise we would need to move pretty - large chunk of pimd to kernel. Ough... --ANK + * so that we cannot check that packet arrived on an oif. + * It is bad, but otherwise we would need to move pretty + * large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && @@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, /* * Forward the frame */ - for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = cache->mfc_un.res.maxvif - 1; + ct >= cache->mfc_un.res.minvif; ct--) { if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) ipmr_queue_xmit(net, mrt, skb2, cache, psend); @@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (psend != -1) { if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) ipmr_queue_xmit(net, mrt, skb2, cache, psend); } else { @@ -1713,6 +1728,7 @@ dont_forward: /* * Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); */ int ip_mr_input(struct sk_buff *skb) @@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb) int err; /* Packet is looped back after forward, it should not be - forwarded second time, but still can be delivered locally. + * forwarded second time, but still can be delivered locally. */ - if (IPCB(skb)->flags&IPSKB_FORWARDED) + if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); @@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb) } if (!local) { - if (IPCB(skb)->opt.router_alert) { - if (ip_call_ra_chain(skb)) - return 0; - } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ - /* IGMPv1 (and broken IGMPv2 implementations sort of - Cisco IOS <= 11.2(8)) do not put router alert - option to IGMP packets destined to routable - groups. It is very bad, because it means - that we can forward NO IGMP messages. - */ - read_lock(&mrt_lock); - if (mrt->mroute_sk) { - nf_reset(skb); - raw_rcv(mrt->mroute_sk, skb); - read_unlock(&mrt_lock); - return 0; - } - read_unlock(&mrt_lock); + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { + /* IGMPv1 (and broken IGMPv2 implementations sort of + * Cisco IOS <= 11.2(8)) do not put router alert + * option to IGMP packets destined to routable + * groups. It is very bad, because it means + * that we can forward NO IGMP messages. + */ + struct sock *mroute_sk; + + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk) { + nf_reset(skb); + raw_rcv(mroute_sk, skb); + return 0; + } } } - read_lock(&mrt_lock); + /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* @@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb) if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); ip_local_deliver(skb); - if (skb2 == NULL) { - read_unlock(&mrt_lock); + if (skb2 == NULL) return -ENOBUFS; - } skb = skb2; } + read_lock(&mrt_lock); vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) { int err2 = ipmr_cache_unresolved(mrt, vif, skb); @@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb) return -ENODEV; } + read_lock(&mrt_lock); ip_mr_forward(net, mrt, skb, cache, local); - read_unlock(&mrt_lock); if (local) @@ -1805,6 +1820,7 @@ dont_forward: } #ifdef CONFIG_IP_PIMSM +/* called with rcu_read_lock() */ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, unsigned int pimlen) { @@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* - Check that: - a. packet is really destinted to a multicast group - b. packet is not a NULL-REGISTER - c. packet is not truncated + * Check that: + * a. packet is really sent to a multicast group + * b. packet is not a NULL-REGISTER + * c. packet is not truncated */ if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || @@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, read_lock(&mrt_lock); if (mrt->mroute_reg_vif_num >= 0) reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; - if (reg_dev) - dev_hold(reg_dev); read_unlock(&mrt_lock); if (reg_dev == NULL) return 1; skb->mac_header = skb->network_header; - skb_pull(skb, (u8*)encap - skb->data); + skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); - skb->ip_summed = 0; + skb->ip_summed = CHECKSUM_NONE; skb->pkt_type = PACKET_HOST; skb_tunnel_rx(skb, reg_dev); netif_rx(skb); - dev_put(reg_dev); - return 0; + return NET_RX_SUCCESS; } #endif @@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, * Handle IGMP messages of PIMv1 */ -int pim_rcv_v1(struct sk_buff * skb) +int pim_rcv_v1(struct sk_buff *skb) { struct igmphdr *pim; struct net *net = dev_net(skb->dev); @@ -1881,7 +1894,7 @@ drop: #endif #ifdef CONFIG_IP_PIMSM_V2 -static int pim_rcv(struct sk_buff * skb) +static int pim_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct net *net = dev_net(skb->dev); @@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); - if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || - (pim->flags&PIM_NULL_REGISTER) || + if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) || + (pim->flags & PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; @@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net, if (mrt == NULL) return -ENOENT; - read_lock(&mrt_lock); + rcu_read_lock(); cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); if (cache == NULL) { struct sk_buff *skb2; struct iphdr *iph; struct net_device *dev; - int vif; + int vif = -1; if (nowait) { - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EAGAIN; } dev = skb->dev; - if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { + read_lock(&mrt_lock); + if (dev) + vif = ipmr_find_vif(mrt, dev); + if (vif < 0) { read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENODEV; } skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENOMEM; } @@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net, iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } - if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + read_lock(&mrt_lock); + if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; err = __ipmr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } @@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[1]; s_e = cb->args[2]; - read_lock(&mrt_lock); + rcu_read_lock(); ipmr_for_each_table(mrt, net) { if (t < s_t) goto next_table; if (t > s_t) s_h = 0; for (h = s_h; h < MFC_LINES; h++) { - list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { if (e < s_e) goto next_entry; if (ipmr_fill_mroute(mrt, skb, @@ -2075,7 +2096,7 @@ next_table: t++; } done: - read_unlock(&mrt_lock); + rcu_read_unlock(); cb->args[2] = e; cb->args[1] = h; @@ -2086,7 +2107,8 @@ done: #ifdef CONFIG_PROC_FS /* - * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif + * The /proc interfaces to multicast routing : + * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ struct ipmr_vif_iter { struct seq_net_private p; @@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct mr_table *mrt = it->mrt; struct mfc_cache *mfc; - read_lock(&mrt_lock); + rcu_read_lock(); for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { it->cache = &mrt->mfc_cache_array[it->ct]; - list_for_each_entry(mfc, it->cache, list) + list_for_each_entry_rcu(mfc, it->cache, list) if (pos-- == 0) return mfc; } - read_unlock(&mrt_lock); + rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); it->cache = &mrt->mfc_unres_queue; @@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* exhausted cache_array, show unresolved */ - read_unlock(&mrt_lock); + rcu_read_unlock(); it->cache = &mrt->mfc_unres_queue; it->ct = 0; @@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!list_empty(it->cache)) return list_first_entry(it->cache, struct mfc_cache, list); - end_of_list: +end_of_list: spin_unlock_bh(&mfc_unres_lock); it->cache = NULL; @@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); else if (it->cache == &mrt->mfc_cache_array[it->ct]) - read_unlock(&mrt_lock); + rcu_read_unlock(); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) mfc->mfc_un.res.bytes, mfc->mfc_un.res.wrong_if); for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++ ) { + n < mfc->mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, @@ -2421,7 +2443,7 @@ int __init ip_mr_init(void) mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); if (!mrt_cachep) return -ENOMEM; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 07de855e2175..d88a46c54fd1 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -43,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* Drop old route. */ skb_dst_drop(skb); - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ @@ -53,11 +53,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) orefdst = skb->_skb_refdst; if (ip_route_input(skb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); + RT_TOS(iph->tos), rt->dst.dev) != 0) { + dst_release(&rt->dst); return -1; } - dst_release(&rt->u.dst); + dst_release(&rt->dst); refdst_drop(orefdst); } @@ -212,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); skb->ip_summed = CHECKSUM_NONE; - csum = __skb_checksum_complete_head(skb, dataoff + len); - if (!csum) - skb->ip_summed = CHECKSUM_UNNECESSARY; + return __skb_checksum_complete_head(skb, dataoff + len); } return csum; } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1833bdbf9805..babd1a2bae5f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -147,7 +147,7 @@ config IP_NF_TARGET_ULOG which can only be viewed through syslog. The appropriate userspace logging daemon (ulogd) may be obtained from - <http://www.gnumonks.org/projects/ulogd/> + <http://www.netfilter.org/projects/ulogd/index.html> To compile it as a module, choose M here. If unsure, say N. @@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN config IP_NF_TARGET_TTL tristate '"TTL" target support' - depends on NETFILTER_ADVANCED + depends on NETFILTER_ADVANCED && IP_NF_MANGLE select NETFILTER_XT_TARGET_HL ---help--- - This is a backwards-compat option for the user's convenience + This is a backwards-compatible option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_HL. diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 1ac01b128621..3cad2591ace0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, for (i = 0; i < len; i++) ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; - return (ret != 0); + return ret != 0; } /* @@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; } -static inline const struct arpt_entry_target * +static inline const struct xt_entry_target * arpt_get_target_c(const struct arpt_entry *e) { return arpt_get_target((struct arpt_entry *)e); @@ -282,17 +282,14 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { - const struct arpt_entry_target *t; - int hdr_len; + const struct xt_entry_target *t; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } - hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + - (2 * skb->dev->addr_len); - ADD_COUNTER(e->counters, hdr_len, 1); + ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); @@ -300,10 +297,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, if (!t->u.kernel.target->target) { int v; - v = ((struct arpt_standard_target *)t)->verdict; + v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ - if (v != ARPT_RETURN) { + if (v != XT_RETURN) { verdict = (unsigned)(-v) - 1; break; } @@ -335,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Target might have changed stuff. */ arp = arp_hdr(skb); - if (verdict == ARPT_CONTINUE) + if (verdict == XT_CONTINUE) e = arpt_next_entry(e); else /* Verdict */ @@ -380,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - const struct arpt_standard_target *t + const struct xt_standard_target *t = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); @@ -395,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo, /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct arpt_entry) && (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < 0 && unconditional(&e->arp)) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -436,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && newpos >= 0) { if (newpos > newinfo->size - sizeof(struct arpt_entry)) { @@ -467,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo, static inline int check_entry(const struct arpt_entry *e, const char *name) { - const struct arpt_entry_target *t; + const struct xt_entry_target *t; if (!arp_checkentry(&e->arp)) { duprintf("arp_tables: arp check failed %p %s.\n", e, name); return -EINVAL; } - if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; t = arpt_get_target_c(e); @@ -486,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name) static inline int check_target(struct arpt_entry *e, const char *name) { - struct arpt_entry_target *t = arpt_get_target(e); + struct xt_entry_target *t = arpt_get_target(e); int ret; struct xt_tgchk_param par = { .table = name, @@ -509,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) static inline int find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; int ret; @@ -539,7 +536,7 @@ out: static bool check_underflow(const struct arpt_entry *e) { - const struct arpt_entry_target *t; + const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(&e->arp)) @@ -547,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e) t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; - verdict = ((struct arpt_standard_target *)t)->verdict; + verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } @@ -569,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, } if (e->next_offset - < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { + < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; @@ -601,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, static inline void cleanup_entry(struct arpt_entry *e) { struct xt_tgdtor_param par; - struct arpt_entry_target *t; + struct xt_entry_target *t; t = arpt_get_target(e); par.target = t->u.kernel.target; @@ -713,7 +710,7 @@ static void get_counters(const struct xt_table_info *t, struct arpt_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = get_cpu(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -723,19 +720,22 @@ static void get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); - curcpu = smp_processor_id(); - i = 0; xt_entry_foreach(iter, t->entries[curcpu], t->size) { SET_COUNTER(counters[i], iter->counters.bcnt, iter->counters.pcnt); ++i; } + local_bh_enable(); + /* Processing counters from other cpus, we can let bottom half enabled, + * (preemption is disabled) + */ for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; + local_bh_disable(); xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { ADD_COUNTER(counters[i], iter->counters.bcnt, @@ -743,8 +743,9 @@ static void get_counters(const struct xt_table_info *t, ++i; } xt_info_wrunlock(cpu); + local_bh_enable(); } - local_bh_enable(); + put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) @@ -758,7 +759,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) * about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vmalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -793,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size, /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ - const struct arpt_entry_target *t; + const struct xt_entry_target *t; e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -806,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size, t = arpt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct arpt_entry_target, + + offsetof(struct xt_entry_target, u.user.name), t->u.kernel.target->name, strlen(t->u.kernel.target->name)+1) != 0) { @@ -843,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { - const struct arpt_entry_target *t; + const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; @@ -894,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info, static int get_info(struct net *net, void __user *user, const int *len, int compat) { - char name[ARPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; @@ -907,7 +908,7 @@ static int get_info(struct net *net, void __user *user, if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; - name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; + name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(NFPROTO_ARP); @@ -1005,8 +1006,7 @@ static int __do_replace(struct net *net, const char *name, struct arpt_entry *iter; ret = 0; - counters = vmalloc_node(num_counters * sizeof(struct xt_counters), - numa_node_id()); + counters = vmalloc(num_counters * sizeof(struct xt_counters)); if (!counters) { ret = -ENOMEM; goto out; @@ -1159,7 +1159,7 @@ static int do_add_counters(struct net *net, const void __user *user, if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = vmalloc(len - size); if (!paddc) return -ENOMEM; @@ -1204,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user, #ifdef CONFIG_COMPAT static inline void compat_release_entry(struct compat_arpt_entry *e) { - struct arpt_entry_target *t; + struct xt_entry_target *t; t = compat_arpt_get_target(e); module_put(t->u.kernel.target->me); @@ -1220,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, const unsigned int *underflows, const char *name) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; int ret, off, h; @@ -1288,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, unsigned int *size, const char *name, struct xt_table_info *newinfo, unsigned char *base) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; struct arpt_entry *de; unsigned int origsize; @@ -1420,6 +1420,9 @@ static int translate_compat_table(const char *name, if (ret != 0) break; ++i; + if (strcmp(arpt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* @@ -1471,7 +1474,7 @@ out_unlock: } struct compat_arpt_replace { - char name[ARPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; @@ -1564,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, struct xt_counters *counters, unsigned int i) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct compat_arpt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; @@ -1625,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, } struct compat_arpt_get_entries { - char name[ARPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_arpt_entry entrytable[0]; }; @@ -1825,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table) /* The built-in targets: standard (NULL) and error. */ static struct xt_target arpt_builtin_tg[] __read_mostly = { { - .name = ARPT_STANDARD_TARGET, + .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_ARP, #ifdef CONFIG_COMPAT @@ -1835,9 +1838,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = { #endif }, { - .name = ARPT_ERROR_TARGET, + .name = XT_ERROR_TARGET, .target = arpt_error, - .targetsize = ARPT_FUNCTION_MAXNAMELEN, + .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_ARP, }, }; diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index e1be7dd1171b..b8ddcc480ed9 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par) return false; if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && - mangle->target != ARPT_CONTINUE) + mangle->target != XT_CONTINUE) return false; return true; } diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index a4e5fc5df4bf..d2c1311cb28d 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -42,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; -static DEFINE_RWLOCK(queue_lock); +static DEFINE_SPINLOCK(queue_lock); static int peer_pid __read_mostly; static unsigned int copy_range __read_mostly; static unsigned int queue_total; @@ -72,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range) break; case IPQ_COPY_PACKET: - copy_mode = mode; + if (range > 0xFFFF) + range = 0xFFFF; copy_range = range; - if (copy_range > 0xFFFF) - copy_range = 0xFFFF; + copy_mode = mode; break; default: @@ -101,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id) { struct nf_queue_entry *entry = NULL, *i; - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); list_for_each_entry(i, &queue_list, list) { if ((unsigned long)i == id) { @@ -115,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id) queue_total--; } - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return entry; } @@ -136,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data) static void ipq_flush(ipq_cmpfn cmpfn, unsigned long data) { - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); __ipq_flush(cmpfn, data); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); } static struct sk_buff * @@ -152,9 +152,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) struct nlmsghdr *nlh; struct timeval tv; - read_lock_bh(&queue_lock); - - switch (copy_mode) { + switch (ACCESS_ONCE(copy_mode)) { case IPQ_COPY_META: case IPQ_COPY_NONE: size = NLMSG_SPACE(sizeof(*pmsg)); @@ -162,26 +160,21 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) case IPQ_COPY_PACKET: if (entry->skb->ip_summed == CHECKSUM_PARTIAL && - (*errp = skb_checksum_help(entry->skb))) { - read_unlock_bh(&queue_lock); + (*errp = skb_checksum_help(entry->skb))) return NULL; - } - if (copy_range == 0 || copy_range > entry->skb->len) + + data_len = ACCESS_ONCE(copy_range); + if (data_len == 0 || data_len > entry->skb->len) data_len = entry->skb->len; - else - data_len = copy_range; size = NLMSG_SPACE(sizeof(*pmsg) + data_len); break; default: *errp = -EINVAL; - read_unlock_bh(&queue_lock); return NULL; } - read_unlock_bh(&queue_lock); - skb = alloc_skb(size, GFP_ATOMIC); if (!skb) goto nlmsg_failure; @@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) if (nskb == NULL) return status; - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); if (!peer_pid) goto err_out_free_nskb; @@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) __ipq_enqueue_entry(entry); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return status; err_out_free_nskb: kfree_skb(nskb); err_out_unlock: - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return status; } @@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range) { int status; - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); status = __ipq_set_mode(mode, range); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return status; } @@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb) if (security_netlink_recv(skb, CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); if (peer_pid) { if (peer_pid != pid) { - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); RCV_SKB_FAIL(-EBUSY); } } else { @@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb) peer_pid = pid; } - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); status = ipq_receive_peer(NLMSG_DATA(nlh), type, nlmsglen - NLMSG_LENGTH(0)); @@ -497,10 +490,10 @@ ipq_rcv_nl_event(struct notifier_block *this, struct netlink_notify *n = ptr; if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { - write_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) __ipq_reset(); - write_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); } return NOTIFY_DONE; } @@ -527,7 +520,7 @@ static ctl_table ipq_table[] = { #ifdef CONFIG_PROC_FS static int ip_queue_show(struct seq_file *m, void *v) { - read_lock_bh(&queue_lock); + spin_lock_bh(&queue_lock); seq_printf(m, "Peer PID : %d\n" @@ -545,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v) queue_dropped, queue_user_dropped); - read_unlock_bh(&queue_lock); + spin_unlock_bh(&queue_lock); return 0; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4b6c5ca610fc..d31b007a6d80 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip) } /* for const-correctness */ -static inline const struct ipt_entry_target * +static inline const struct xt_entry_target * ipt_get_target_c(const struct ipt_entry *e) { return ipt_get_target((struct ipt_entry *)e); @@ -230,9 +230,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { - const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); + const struct xt_standard_target *t = (void *)ipt_get_target_c(s); - if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; @@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, if (s->target_offset == sizeof(struct ipt_entry) && strcmp(t->target.u.kernel.target->name, - IPT_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && t->verdict < 0 && unconditional(&s->ip)) { /* Tail of chains: STANDARD target (return/policy) */ @@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb, get_entry(table_base, private->underflow[hook])); do { - const struct ipt_entry_target *t; + const struct xt_entry_target *t; const struct xt_entry_match *ematch; IP_NF_ASSERT(e); @@ -364,7 +364,7 @@ ipt_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); + ADD_COUNTER(e->counters, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -380,10 +380,10 @@ ipt_do_table(struct sk_buff *skb, if (!t->u.kernel.target->target) { int v; - v = ((struct ipt_standard_target *)t)->verdict; + v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ - if (v != IPT_RETURN) { + if (v != XT_RETURN) { verdict = (unsigned)(-v) - 1; break; } @@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb, verdict = t->u.kernel.target->target(skb, &acpar); /* Target might have changed stuff. */ ip = ip_hdr(skb); - if (verdict == IPT_CONTINUE) + if (verdict == XT_CONTINUE) e = ipt_next_entry(e); else /* Verdict */ @@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - const struct ipt_standard_target *t + const struct xt_standard_target *t = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); @@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo, /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct ipt_entry) && (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < 0 && unconditional(&e->ip)) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && newpos >= 0) { if (newpos > newinfo->size - sizeof(struct ipt_entry)) { @@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo, return 1; } -static void cleanup_match(struct ipt_entry_match *m, struct net *net) +static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; @@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net) static int check_entry(const struct ipt_entry *e, const char *name) { - const struct ipt_entry_target *t; + const struct xt_entry_target *t; if (!ip_checkentry(&e->ip)) { duprintf("ip check failed %p %s.\n", e, par->match->name); return -EINVAL; } - if (e->target_offset + sizeof(struct ipt_entry_target) > + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name) } static int -check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) +check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; int ret; @@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) } static int -find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; @@ -630,7 +630,7 @@ err: static int check_target(struct ipt_entry *e, struct net *net, const char *name) { - struct ipt_entry_target *t = ipt_get_target(e); + struct xt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, @@ -656,7 +656,7 @@ static int find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int size) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; @@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, static bool check_underflow(const struct ipt_entry *e) { - const struct ipt_entry_target *t; + const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(&e->ip)) @@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e) t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; - verdict = ((struct ipt_standard_target *)t)->verdict; + verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } @@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, } if (e->next_offset - < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { + < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; @@ -771,7 +771,7 @@ static void cleanup_entry(struct ipt_entry *e, struct net *net) { struct xt_tgdtor_param par; - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ @@ -884,7 +884,7 @@ get_counters(const struct xt_table_info *t, struct ipt_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = get_cpu(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -894,19 +894,22 @@ get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); - curcpu = smp_processor_id(); - i = 0; xt_entry_foreach(iter, t->entries[curcpu], t->size) { SET_COUNTER(counters[i], iter->counters.bcnt, iter->counters.pcnt); ++i; } + local_bh_enable(); + /* Processing counters from other cpus, we can let bottom half enabled, + * (preemption is disabled) + */ for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; + local_bh_disable(); xt_info_wrlock(cpu); xt_entry_foreach(iter, t->entries[cpu], t->size) { ADD_COUNTER(counters[i], iter->counters.bcnt, @@ -914,8 +917,9 @@ get_counters(const struct xt_table_info *t, ++i; /* macro does multi eval of i */ } xt_info_wrunlock(cpu); + local_bh_enable(); } - local_bh_enable(); + put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) @@ -928,7 +932,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; - counters = vmalloc_node(countersize, numa_node_id()); + counters = vmalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); @@ -968,8 +972,8 @@ copy_entries_to_user(unsigned int total_size, /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; - const struct ipt_entry_match *m; - const struct ipt_entry_target *t; + const struct xt_entry_match *m; + const struct xt_entry_target *t; e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -986,7 +990,7 @@ copy_entries_to_user(unsigned int total_size, m = (void *)e + i; if (copy_to_user(userptr + off + i - + offsetof(struct ipt_entry_match, + + offsetof(struct xt_entry_match, u.user.name), m->u.kernel.match->name, strlen(m->u.kernel.match->name)+1) @@ -998,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size, t = ipt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct ipt_entry_target, + + offsetof(struct xt_entry_target, u.user.name), t->u.kernel.target->name, strlen(t->u.kernel.target->name)+1) != 0) { @@ -1036,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; - const struct ipt_entry_target *t; + const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; @@ -1088,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info, static int get_info(struct net *net, void __user *user, const int *len, int compat) { - char name[IPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; @@ -1101,7 +1105,7 @@ static int get_info(struct net *net, void __user *user, if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; - name[IPT_TABLE_MAXNAMELEN-1] = '\0'; + name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(AF_INET); @@ -1352,7 +1356,7 @@ do_add_counters(struct net *net, const void __user *user, if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc_node(len - size, numa_node_id()); + paddc = vmalloc(len - size); if (!paddc) return -ENOMEM; @@ -1396,14 +1400,14 @@ do_add_counters(struct net *net, const void __user *user, #ifdef CONFIG_COMPAT struct compat_ipt_replace { - char name[IPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; - compat_uptr_t counters; /* struct ipt_counters * */ + compat_uptr_t counters; /* struct xt_counters * */ struct compat_ipt_entry entries[0]; }; @@ -1412,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct compat_ipt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; @@ -1447,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, } static int -compat_find_calc_match(struct ipt_entry_match *m, +compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ipt_ip *ip, unsigned int hookmask, @@ -1469,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m, static void compat_release_entry(struct compat_ipt_entry *e) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ @@ -1490,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, const char *name) { struct xt_entry_match *ematch; - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; @@ -1572,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, unsigned int *size, const char *name, struct xt_table_info *newinfo, unsigned char *base) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; struct ipt_entry *de; unsigned int origsize; @@ -1747,6 +1751,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ipt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* @@ -1877,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, } struct compat_ipt_get_entries { - char name[IPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ipt_entry entrytable[0]; }; @@ -2032,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IPT_SO_GET_REVISION_MATCH: case IPT_SO_GET_REVISION_TARGET: { - struct ipt_get_revision rev; + struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { @@ -2169,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par) static struct xt_target ipt_builtin_tg[] __read_mostly = { { - .name = IPT_STANDARD_TARGET, + .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV4, #ifdef CONFIG_COMPAT @@ -2179,9 +2186,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = { #endif }, { - .name = IPT_ERROR_TARGET, + .name = XT_ERROR_TARGET, .target = ipt_error, - .targetsize = IPT_FUNCTION_MAXNAMELEN, + .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV4, }, }; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f91c94b9a790..1e26a4897655 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/checksum.h> +#include <net/ip.h> #define CLUSTERIP_VERSION "0.8" @@ -53,12 +54,13 @@ struct clusterip_config { #endif enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ + struct rcu_head rcu; }; static LIST_HEAD(clusterip_configs); /* clusterip_lock protects the clusterip_configs list */ -static DEFINE_RWLOCK(clusterip_lock); +static DEFINE_SPINLOCK(clusterip_lock); #ifdef CONFIG_PROC_FS static const struct file_operations clusterip_proc_fops; @@ -71,11 +73,17 @@ clusterip_config_get(struct clusterip_config *c) atomic_inc(&c->refcount); } + +static void clusterip_config_rcu_free(struct rcu_head *head) +{ + kfree(container_of(head, struct clusterip_config, rcu)); +} + static inline void clusterip_config_put(struct clusterip_config *c) { if (atomic_dec_and_test(&c->refcount)) - kfree(c); + call_rcu_bh(&c->rcu, clusterip_config_rcu_free); } /* decrease the count of entries using/referencing this config. If last @@ -84,10 +92,11 @@ clusterip_config_put(struct clusterip_config *c) static inline void clusterip_config_entry_put(struct clusterip_config *c) { - write_lock_bh(&clusterip_lock); - if (atomic_dec_and_test(&c->entries)) { - list_del(&c->list); - write_unlock_bh(&clusterip_lock); + local_bh_disable(); + if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) { + list_del_rcu(&c->list); + spin_unlock(&clusterip_lock); + local_bh_enable(); dev_mc_del(c->dev, c->clustermac); dev_put(c->dev); @@ -100,7 +109,7 @@ clusterip_config_entry_put(struct clusterip_config *c) #endif return; } - write_unlock_bh(&clusterip_lock); + local_bh_enable(); } static struct clusterip_config * @@ -108,7 +117,7 @@ __clusterip_config_find(__be32 clusterip) { struct clusterip_config *c; - list_for_each_entry(c, &clusterip_configs, list) { + list_for_each_entry_rcu(c, &clusterip_configs, list) { if (c->clusterip == clusterip) return c; } @@ -121,16 +130,15 @@ clusterip_config_find_get(__be32 clusterip, int entry) { struct clusterip_config *c; - read_lock_bh(&clusterip_lock); + rcu_read_lock_bh(); c = __clusterip_config_find(clusterip); - if (!c) { - read_unlock_bh(&clusterip_lock); - return NULL; + if (c) { + if (unlikely(!atomic_inc_not_zero(&c->refcount))) + c = NULL; + else if (entry) + atomic_inc(&c->entries); } - atomic_inc(&c->refcount); - if (entry) - atomic_inc(&c->entries); - read_unlock_bh(&clusterip_lock); + rcu_read_unlock_bh(); return c; } @@ -181,9 +189,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, } #endif - write_lock_bh(&clusterip_lock); - list_add(&c->list, &clusterip_configs); - write_unlock_bh(&clusterip_lock); + spin_lock_bh(&clusterip_lock); + list_add_rcu(&c->list, &clusterip_configs); + spin_unlock_bh(&clusterip_lock); return c; } @@ -224,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; - u_int16_t sport, dport; - const u_int16_t *ports; - - switch (iph->protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ICMP: - ports = (const void *)iph+iph->ihl*4; - sport = ports[0]; - dport = ports[1]; - break; - default: + u_int16_t sport = 0, dport = 0; + int poff; + + poff = proto_ports_offset(iph->protocol); + if (poff >= 0) { + const u_int16_t *ports; + u16 _ports[2]; + + ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); + if (ports) { + sport = ports[0]; + dport = ports[1]; + } + } else { if (net_ratelimit()) pr_info("unknown protocol %u\n", iph->protocol); - sport = dport = 0; } switch (config->hash_mode) { @@ -462,7 +468,7 @@ struct arp_payload { __be32 src_ip; u_int8_t dst_hw[ETH_ALEN]; __be32 dst_ip; -} __attribute__ ((packed)); +} __packed; #ifdef DEBUG static void arp_print(struct arp_payload *payload) @@ -733,6 +739,9 @@ static void __exit clusterip_tg_exit(void) #endif nf_unregister_hook(&cip_arp_ops); xt_unregister_target(&clusterip_tg_reg); + + /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */ + rcu_barrier_bh(); } module_init(clusterip_tg_init); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 5234f4f3499a..72ffc8fda2e9 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> +#include <linux/if_arp.h> #include <linux/ip.h> #include <net/icmp.h> #include <net/udp.h> @@ -23,16 +24,15 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_LOG.h> #include <net/netfilter/nf_log.h> +#include <net/netfilter/xt_log.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); -/* Use lock to serialize, so printks don't overlap */ -static DEFINE_SPINLOCK(log_lock); - /* One level of recursion won't kill us */ -static void dump_packet(const struct nf_loginfo *info, +static void dump_packet(struct sbuff *m, + const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -47,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info, ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - printk("SRC=%pI4 DST=%pI4 ", + sb_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) - printk("CE "); + sb_add(m, "CE "); if (ntohs(ih->frag_off) & IP_DF) - printk("DF "); + sb_add(m, "DF "); if (ntohs(ih->frag_off) & IP_MF) - printk("MF "); + sb_add(m, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) - printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { @@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info, op = skb_header_pointer(skb, iphoff+sizeof(_iph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + sb_add(m, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); } switch (ih->protocol) { @@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + sb_add(m, "PROTO=TCP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -110,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info, th = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & IPT_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); + sb_add(m, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3F " */ - printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + sb_add(m, "CWR "); if (th->ece) - printk("ECE "); + sb_add(m, "ECE "); if (th->urg) - printk("URG "); + sb_add(m, "URG "); if (th->ack) - printk("ACK "); + sb_add(m, "ACK "); if (th->psh) - printk("PSH "); + sb_add(m, "PSH "); if (th->rst) - printk("RST "); + sb_add(m, "RST "); if (th->syn) - printk("SYN "); + sb_add(m, "SYN "); if (th->fin) - printk("FIN "); + sb_add(m, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info, iphoff+ih->ihl*4+sizeof(_tcph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + sb_add(m, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); } break; } @@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info, if (ih->protocol == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); + sb_add(m, "PROTO=UDP " ); else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); + sb_add(m, "PROTO=UDPLITE "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info, uh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info, [ICMP_ADDRESSREPLY] = 12 }; /* Max length: 11 "PROTO=ICMP " */ - printk("PROTO=ICMP "); + sb_add(m, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info, ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (ich == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ich->type, ich->code); + sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } @@ -250,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info, case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + sb_add(m, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ - printk("PARAMETER=%u ", + sb_add(m, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ - printk("GATEWAY=%pI4 ", &ich->un.gateway); + sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); /* Fall through */ case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ - printk("["); - dump_packet(info, skb, + sb_add(m, "["); + dump_packet(m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); - printk("] "); + sb_add(m, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) - printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); } break; } @@ -291,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info, break; /* Max length: 9 "PROTO=AH " */ - printk("PROTO=AH "); + sb_add(m, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_ahdr), &_ahdr); if (ah == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(ah->spi)); + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { @@ -311,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct ip_esp_hdr *eh; /* Max length: 10 "PROTO=ESP " */ - printk("PROTO=ESP "); + sb_add(m, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -320,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info, eh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(eh->spi)); + sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: - printk("PROTO=%u ", ih->protocol); + sb_add(m, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u ", + sb_add(m, "UID=%u GID=%u ", skb->sk->sk_socket->file->f_cred->fsuid, skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); @@ -346,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) - printk("MARK=0x%x ", skb->mark); + sb_add(m, "MARK=0x%x ", skb->mark); /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ @@ -363,6 +363,43 @@ static void dump_packet(const struct nf_loginfo *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } +static void dump_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & IPT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + sb_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + sb_add(m, ":%02x", *p); + } + sb_add(m, " "); +} + static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { @@ -382,11 +419,12 @@ ipt_log_packet(u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { + struct sbuff *m = sb_open(); + if (!loginfo) loginfo = &default_loginfo; - spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); @@ -397,31 +435,20 @@ ipt_log_packet(u_int8_t pf, physindev = skb->nf_bridge->physindev; if (physindev && in != physindev) - printk("PHYSIN=%s ", physindev->name); + sb_add(m, "PHYSIN=%s ", physindev->name); physoutdev = skb->nf_bridge->physoutdev; if (physoutdev && out != physoutdev) - printk("PHYSOUT=%s ", physoutdev->name); + sb_add(m, "PHYSOUT=%s ", physoutdev->name); } #endif - if (in && !out) { - /* MAC logging for input chain only. */ - printk("MAC="); - if (skb->dev && skb->dev->hard_header_len && - skb->mac_header != skb->network_header) { - int i; - const unsigned char *p = skb_mac_header(skb); - for (i = 0; i < skb->dev->hard_header_len; i++,p++) - printk("%02x%c", *p, - i==skb->dev->hard_header_len - 1 - ? ' ':':'); - } else - printk(" "); - } + /* MAC logging for input path only. */ + if (in && !out) + dump_mac_header(m, loginfo, skb); + + dump_packet(m, loginfo, skb, 0); - dump_packet(loginfo, skb, 0); - printk("\n"); - spin_unlock_bh(&log_lock); + sb_close(m); } static unsigned int diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index f43867d1697f..6cdb298f1035 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -48,7 +48,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par) NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || par->hooknum == NF_INET_POST_ROUTING || - par->hooknum == NF_INET_LOCAL_OUT); + par->hooknum == NF_INET_LOCAL_OUT || + par->hooknum == NF_INET_LOCAL_IN); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); @@ -77,7 +78,8 @@ static struct xt_target netmap_tg_reg __read_mostly = { .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_OUT), + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_LOCAL_IN), .checkentry = netmap_tg_check, .me = THIS_MODULE }; diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index f5f4a888e4ec..43eec80c0e7c 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -95,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook) } tcph->rst = 1; - tcph->check = tcp_v4_check(sizeof(struct tcphdr), - niph->saddr, niph->daddr, - csum_partial(tcph, - sizeof(struct tcphdr), 0)); + tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, + niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)tcph - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); addr_type = RTN_UNSPEC; if (hook != NF_INET_FORWARD @@ -109,13 +110,13 @@ static void send_reset(struct sk_buff *oldskb, int hook) addr_type = RTN_LOCAL; /* ip_route_me_harder expects skb->dst to be set */ - skb_dst_set(nskb, dst_clone(skb_dst(oldskb))); + skb_dst_set_noref(nskb, skb_dst(oldskb)); + nskb->protocol = htons(ETH_P_IP); if (ip_route_me_harder(nskb, addr_type)) goto free_nskb; niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); - nskb->ip_summed = CHECKSUM_NONE; /* "Never happens" */ if (nskb->len > dst_mtu(skb_dst(nskb))) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 244f7cb08d68..37f8adb68c79 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/percpu.h> +#include <linux/security.h> #include <net/net_namespace.h> #include <linux/netfilter.h> @@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v) rcu_read_unlock(); } +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return ret; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; @@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #endif -#ifdef CONFIG_NF_CONNTRACK_SECMARK - if (seq_printf(s, "secmark=%u ", ct->secmark)) + if (ct_show_secctx(s, ct)) goto release; -#endif if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) goto release; diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index cb763ae9ed90..f3a9b42b16c6 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -66,6 +66,13 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (sk && (sk->sk_family == PF_INET) && + inet->nodefrag) + return NF_ACCEPT; + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index c31b87668250..0f23b3f06df0 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 4f8bddb760c9..295c97431e43 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock); static struct nf_conntrack_l3proto *l3proto __read_mostly; #define MAX_IP_NAT_PROTO 256 -static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] +static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; static inline const struct nf_nat_protocol * @@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum) return rcu_dereference(nf_nat_protos[protonum]); } -const struct nf_nat_protocol * +static const struct nf_nat_protocol * nf_nat_proto_find_get(u_int8_t protonum) { const struct nf_nat_protocol *p; @@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum) return p; } -EXPORT_SYMBOL_GPL(nf_nat_proto_find_get); -void +static void nf_nat_proto_put(const struct nf_nat_protocol *p) { module_put(p->me); } -EXPORT_SYMBOL_GPL(nf_nat_proto_put); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int @@ -261,17 +259,18 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, rcu_read_lock(); proto = __nf_nat_proto_find(orig_tuple->dst.protonum); - /* Change protocol info to have some randomization */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) { - proto->unique_tuple(tuple, range, maniptype, ct); - goto out; - } - /* Only bother mapping if it's not already in range and unique */ - if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || - proto->in_range(tuple, maniptype, &range->min, &range->max)) && - !nf_nat_used_tuple(tuple, ct)) - goto out; + if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { + if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) { + if (proto->in_range(tuple, maniptype, &range->min, + &range->max) && + (range->min.all == range->max.all || + !nf_nat_used_tuple(tuple, ct))) + goto out; + } else if (!nf_nat_used_tuple(tuple, ct)) { + goto out; + } + } /* Last change: get protocol to try to obtain unique tuple. */ proto->unique_tuple(tuple, range, maniptype, ct); @@ -440,7 +439,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; - inside = (void *)skb->data + ip_hdrlen(skb); + inside = (void *)skb->data + hdrlen; /* We're actually going to mangle it beyond trivial checksum adjustment, so make sure the current checksum is correct. */ @@ -463,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, return 0; } + if (manip == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + pr_debug("icmp_reply_translation: translating error %p manip %u " "dir %s\n", skb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); @@ -470,12 +481,10 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* rcu_read_lock()ed by nf_hook_slow */ l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); - if (!nf_ct_get_tuple(skb, - ip_hdrlen(skb) + sizeof(struct icmphdr), - (ip_hdrlen(skb) + + if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr), + (hdrlen + sizeof(struct icmphdr) + inside->ip.ihl * 4), - (u_int16_t)AF_INET, - inside->ip.protocol, + (u_int16_t)AF_INET, inside->ip.protocol, &inner, l3proto, l4proto)) return 0; @@ -484,15 +493,13 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, pass all hooks (locally-generated ICMP). Consider incoming packet: PREROUTING (DST manip), routing produces ICMP, goes through POSTROUTING (which must correct the DST manip). */ - if (!manip_pkt(inside->ip.protocol, skb, - ip_hdrlen(skb) + sizeof(inside->icmp), - &ct->tuplehash[!dir].tuple, - !manip)) + if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp), + &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt inner. */ - inside = (void *)skb->data + ip_hdrlen(skb); + inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, @@ -501,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* Change outer to look the reply to an incoming packet * (proto 0 means don't invert per-proto part). */ - if (manip == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!manip_pkt(0, skb, 0, &target, manip)) - return 0; - } + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + if (!manip_pkt(0, skb, 0, &target, manip)) + return 0; return 1; } @@ -742,7 +738,7 @@ static int __init nf_nat_init(void) spin_unlock_bh(&nf_nat_lock); /* Initialize fake conntrack so that NAT will skip it */ - nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index 86e0e84ff0a0..dc73abb3fe27 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 5045196d853c..790f3160e012 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, /* Try to get a pair of ports. */ for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); nated_port != 0; nated_port += 2) { + int ret; + rtp_exp->tuple.dst.u.udp.port = htons(nated_port); - if (nf_ct_expect_related(rtp_exp) == 0) { + ret = nf_ct_expect_related(rtp_exp); + if (ret == 0) { rtcp_exp->tuple.dst.u.udp.port = htons(nated_port + 1); - if (nf_ct_expect_related(rtcp_exp) == 0) + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + nated_port = 0; break; - nf_ct_unexpect_related(rtp_exp); + } + } else if (ret != -EBUSY) { + nated_port = 0; + break; } } @@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (; nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; break; + } } if (nated_port == 0) { /* No port available */ @@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (; nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } } if (nated_port == 0) { /* No port available */ @@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (; nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; break; + } } if (nated_port == 0) { /* No port available */ @@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (nated_port = ntohs(port); nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } } if (nated_port == 0) { /* No port available */ diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 4a0c6b548eee..31427fb57aa8 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); +static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data, + int datalen, __sum16 *check, int oldlen) +{ + struct rtable *rt = skb_rtable(skb); + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + if (!(rt->rt_flags & RTCF_LOCAL) && + skb->dev->features & NETIF_F_V4_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + + skb_network_offset(skb) + + iph->ihl * 4; + skb->csum_offset = (void *)check - data; + *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, iph->protocol, 0); + } else { + *check = 0; + *check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, iph->protocol, + csum_partial(data, datalen, + 0)); + if (iph->protocol == IPPROTO_UDP && !*check) + *check = CSUM_MANGLED_0; + } + } else + inet_proto_csum_replace2(check, skb, + htons(oldlen), htons(datalen), 1); +} + /* Generic function for mangling variable-length address changes inside * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX * command in FTP). @@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len, bool adjust) { - struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct tcphdr *tcph; int oldlen, datalen; @@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, match_offset, match_len, rep_buffer, rep_len); datalen = skb->len - iph->ihl*4; - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - skb->dev->features & NETIF_F_V4_CSUM) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - iph->ihl * 4; - skb->csum_offset = offsetof(struct tcphdr, check); - tcph->check = ~tcp_v4_check(datalen, - iph->saddr, iph->daddr, 0); - } else { - tcph->check = 0; - tcph->check = tcp_v4_check(datalen, - iph->saddr, iph->daddr, - csum_partial(tcph, - datalen, 0)); - } - } else - inet_proto_csum_replace2(&tcph->check, skb, - htons(oldlen), htons(datalen), 1); + nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen); if (adjust && rep_len != match_len) nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, @@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct udphdr *udph; int datalen, oldlen; @@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) return 1; - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - skb->dev->features & NETIF_F_V4_CSUM) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - iph->ihl * 4; - skb->csum_offset = offsetof(struct udphdr, check); - udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - 0); - } else { - udph->check = 0; - udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - csum_partial(udph, - datalen, 0)); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } - } else - inet_proto_csum_replace2(&udph->check, skb, - htons(oldlen), htons(datalen), 1); + nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen); return 1; } diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c index ea83a886b03e..535e1a802356 100644 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ b/net/ipv4/netfilter/nf_nat_irc.c @@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c index 6c4f11f51446..3e61faf23a9a 100644 --- a/net/ipv4/netfilter/nf_nat_proto_common.c +++ b/net/ipv4/netfilter/nf_nat_proto_common.c @@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple, } EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); -bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, +void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, @@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == IP_NAT_MANIP_DST) - return false; + return; if (ntohs(*portptr) < 1024) { /* Loose convention: >> 512 is credential passing */ @@ -81,15 +81,15 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, else off = *rover; - for (i = 0; i < range_size; i++, off++) { + for (i = 0; ; ++off) { *portptr = htons(min + off % range_size); - if (nf_nat_used_tuple(tuple, ct)) + if (++i != range_size && nf_nat_used_tuple(tuple, ct)) continue; if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) *rover = off; - return true; + return; } - return false; + return; } EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c index 22485ce306d4..570faf2667b2 100644 --- a/net/ipv4/netfilter/nf_nat_proto_dccp.c +++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c @@ -22,14 +22,14 @@ static u_int16_t dccp_port_rover; -static bool +static void dccp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &dccp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &dccp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index d7e89201351e..bc8d83a31c73 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); /* generate unique tuple ... */ -static bool +static void gre_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, @@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) - return false; + return; if (maniptype == IP_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; @@ -68,14 +68,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, pr_debug("min = %u, range_size = %u\n", min, range_size); - for (i = 0; i < range_size; i++, key++) { + for (i = 0; ; ++key) { *keyptr = htons(min + key % range_size); - if (!nf_nat_used_tuple(tuple, ct)) - return true; + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; } pr_debug("%p: no NAT mapping\n", ct); - return false; + return; } /* manipulate a GRE packet according to maniptype */ diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 19a8b0b07d8e..5744c3ec847c 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); } -static bool +static void icmp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, @@ -42,13 +42,13 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple, if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) range_size = 0xFFFF; - for (i = 0; i < range_size; i++, id++) { + for (i = 0; ; ++id) { tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + (id % range_size)); - if (!nf_nat_used_tuple(tuple, ct)) - return true; + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; } - return false; + return; } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c index 3fc598eeeb1a..756331d42661 100644 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -16,14 +16,14 @@ static u_int16_t nf_sctp_port_rover; -static bool +static void sctp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &nf_sctp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &nf_sctp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index 399e2cfa263b..aa460a595d5d 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -20,14 +20,13 @@ static u_int16_t tcp_port_rover; -static bool +static void tcp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &tcp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index 9e61c79492e4..dfe65c7e2925 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -19,14 +19,13 @@ static u_int16_t udp_port_rover; -static bool +static void udp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &udp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c index 440a229bbd87..3cc8c8af39ef 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udplite.c +++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c @@ -18,14 +18,14 @@ static u_int16_t udplite_port_rover; -static bool +static void udplite_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &udplite_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &udplite_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c index 14381c62acea..a50f2bc1c732 100644 --- a/net/ipv4/netfilter/nf_nat_proto_unknown.c +++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c @@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, return true; } -static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple, +static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { /* Sorry: we can't help you; if it's not unique, we can't frob anything. */ - return false; + return; } static bool diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 98ed78281aee..21c30426480b 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -28,7 +28,8 @@ #define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_POST_ROUTING) | \ - (1 << NF_INET_LOCAL_OUT)) + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_LOCAL_IN)) static const struct xt_table nat_table = { .name = "nat", @@ -45,7 +46,8 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par) enum ip_conntrack_info ctinfo; const struct nf_nat_multi_range_compat *mr = par->targinfo; - NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); + NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING || + par->hooknum == NF_INET_LOCAL_IN); ct = nf_ct_get(skb, &ctinfo); @@ -99,21 +101,20 @@ static int ipt_dnat_checkentry(const struct xt_tgchk_param *par) return 0; } -unsigned int +static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { /* Force range to this IP; let proto decide mapping for per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - Use reply in case it's already been mangled (eg local packet). */ - __be32 ip - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip - : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - struct nf_nat_range range - = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; - - pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip); + struct nf_nat_range range; + + range.flags = 0; + pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, + HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); } @@ -141,7 +142,7 @@ static struct xt_target ipt_snat_reg __read_mostly = { .target = ipt_snat_target, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = 1 << NF_INET_POST_ROUTING, + .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .checkentry = ipt_snat_checkentry, .family = AF_INET, }; diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 11b538deaaec..e40cf7816fdb 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, exp->expectfn = ip_nat_sip_expected; for (; port != 0; port++) { + int ret; + exp->tuple.dst.u.udp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) @@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, /* Try to get same pair of ports: if not, try to change them. */ for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); port != 0; port += 2) { + int ret; + rtp_exp->tuple.dst.u.udp.port = htons(port); - if (nf_ct_expect_related(rtp_exp) != 0) + ret = nf_ct_expect_related(rtp_exp); + if (ret == -EBUSY) continue; + else if (ret < 0) { + port = 0; + break; + } rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); - if (nf_ct_expect_related(rtcp_exp) == 0) + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) break; - nf_ct_unexpect_related(rtp_exp); + else if (ret != -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + port = 0; + break; + } } if (port == 0) diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 1679e2c0963d..ee5f419d0a56 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum, unsigned char s[4]; if (offset & 1) { - s[0] = s[2] = 0; + s[0] = ~0; s[1] = ~*optr; + s[2] = 0; s[3] = *nptr; } else { - s[1] = s[3] = 0; s[0] = ~*optr; + s[1] = ~0; s[2] = *nptr; + s[3] = 0; } *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index beb25819c9c9..95481fee8bdb 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum, return NF_ACCEPT; /* Don't try to NAT if this packet is not conntracked */ - if (ct == &nf_conntrack_untracked) + if (nf_ct_is_untracked(ct)) return NF_ACCEPT; nat = nfct_nat(ct); @@ -131,13 +131,7 @@ nf_nat_fn(unsigned int hooknum, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - if (hooknum == NF_INET_LOCAL_IN) - /* LOCAL_IN hook doesn't have a chain! */ - ret = alloc_null_binding(ct, hooknum); - else - ret = nf_nat_rule_find(skb, hooknum, in, out, - ct); - + ret = nf_nat_rule_find(skb, hooknum, in, out, ct); if (ret != NF_ACCEPT) return ret; } else diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3dc9914c1dce..4ae1f203f7cb 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -252,6 +252,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), + SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), SNMP_MIB_SENTINEL }; @@ -342,10 +343,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v) IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, sysctl_ip_default_ttl); + BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.ip_statistics, - snmp4_ipstats_list[i].entry)); + seq_printf(seq, " %llu", + snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp4_ipstats_list[i].entry, + offsetof(struct ipstats_mib, syncp))); icmp_put(seq); /* RFC 2011 compatibility */ icmpmsg_put(seq); @@ -431,9 +434,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field((void __percpu **)net->mib.ip_statistics, - snmp4_ipextstats_list[i].entry)); + seq_printf(seq, " %llu", + snmp_fold_field64((void __percpu **)net->mib.ip_statistics, + snmp4_ipextstats_list[i].entry, + offsetof(struct ipstats_mib, syncp))); seq_putc(seq, '\n'); return 0; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 542f22fc98b3..9ae5c01cd0b2 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,8 +28,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; -static DEFINE_SPINLOCK(inet_proto_lock); +const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; /* * Add a protocol handler to the hash tables @@ -37,21 +36,12 @@ static DEFINE_SPINLOCK(inet_proto_lock); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; + int hash = protocol & (MAX_INET_PROTOS - 1); - hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash]) { - ret = -1; - } else { - inet_protos[hash] = prot; - ret = 0; - } - spin_unlock_bh(&inet_proto_lock); - - return ret; + return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + NULL, prot) ? 0 : -1; } +EXPORT_SYMBOL(inet_add_protocol); /* * Remove a protocol from the hash tables. @@ -59,23 +49,13 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; + int ret, hash = protocol & (MAX_INET_PROTOS - 1); - hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash] == prot) { - inet_protos[hash] = NULL; - ret = 0; - } else { - ret = -1; - } - spin_unlock_bh(&inet_proto_lock); + ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); return ret; } - -EXPORT_SYMBOL(inet_add_protocol); EXPORT_SYMBOL(inet_del_protocol); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2c7a1639388a..1f85ef289895 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -314,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) } static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, - struct rtable *rt, + struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); @@ -323,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, struct sk_buff *skb; unsigned int iphlen; int err; + struct rtable *rt = *rtp; - if (length > rt->u.dst.dev->mtu) { + if (length > rt->dst.dev->mtu) { ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, - rt->u.dst.dev->mtu); + rt->dst.dev->mtu); return -EMSGSIZE; } if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, - length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, + length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15, flags & MSG_DONTWAIT, &err); if (skb == NULL) goto error; - skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev)); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - skb_dst_set(skb, dst_clone(&rt->u.dst)); + skb_dst_set(skb, &rt->dst); + *rtp = NULL; skb_reset_network_header(skb); iph = ip_hdr(skb); @@ -373,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(iph, &rt->u.dst, NULL); + ip_select_ident(iph, &rt->dst, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -382,7 +384,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb_transport_header(skb))->type); err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL, - rt->u.dst.dev, dst_output); + rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) @@ -503,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { @@ -576,7 +578,7 @@ back_from_confirm: if (inet->hdrincl) err = raw_send_hdrinc(sk, msg->msg_iov, len, - rt, msg->msg_flags); + &rt, msg->msg_flags); else { if (!ipc.addr) @@ -604,7 +606,7 @@ out: return len; do_confirm: - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 560acc677ce4..987bf9adb318 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = { .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .local_out = __ip_local_out, - .entries = ATOMIC_INIT(0), }; #define ECN_OR_COST(class) TC_PRIO_##class @@ -199,7 +198,7 @@ const __u8 ip_tos2prio[16] = { */ struct rt_hash_bucket { - struct rtable *chain; + struct rtable __rcu *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ @@ -253,8 +252,7 @@ static unsigned rt_hash_mask __read_mostly; static unsigned int rt_hash_log __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); -#define RT_CACHE_STAT_INC(field) \ - (__raw_get_cpu_var(rt_cache_stat).field++) +#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, int genid) @@ -282,15 +280,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rt_hash_table[st->bucket].chain) + if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) continue; rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); while (r) { - if (dev_net(r->u.dst.dev) == seq_file_net(seq) && + if (dev_net(r->dst.dev) == seq_file_net(seq) && r->rt_genid == st->genid) return r; - r = rcu_dereference_bh(r->u.dst.rt_next); + r = rcu_dereference_bh(r->dst.rt_next); } rcu_read_unlock_bh(); } @@ -302,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; - r = r->u.dst.rt_next; + r = rcu_dereference_bh(r->dst.rt_next); while (!r) { rcu_read_unlock_bh(); do { if (--st->bucket < 0) return NULL; - } while (!rt_hash_table[st->bucket].chain); + } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; + r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); } - return rcu_dereference_bh(r); + return r; } static struct rtable *rt_cache_get_next(struct seq_file *seq, @@ -320,7 +318,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; while ((r = __rt_cache_get_next(seq, r)) != NULL) { - if (dev_net(r->u.dst.dev) != seq_file_net(seq)) + if (dev_net(r->dst.dev) != seq_file_net(seq)) continue; if (r->rt_genid == st->genid) break; @@ -378,19 +376,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", - r->u.dst.dev ? r->u.dst.dev->name : "*", + r->dst.dev ? r->dst.dev->name : "*", (__force u32)r->rt_dst, (__force u32)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.__refcnt), - r->u.dst.__use, 0, (__force u32)r->rt_src, - (dst_metric(&r->u.dst, RTAX_ADVMSS) ? - (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), - dst_metric(&r->u.dst, RTAX_WINDOW), - (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + - dst_metric(&r->u.dst, RTAX_RTTVAR)), + r->rt_flags, atomic_read(&r->dst.__refcnt), + r->dst.__use, 0, (__force u32)r->rt_src, + (dst_metric(&r->dst, RTAX_ADVMSS) ? + (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0), + dst_metric(&r->dst, RTAX_WINDOW), + (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + + dst_metric(&r->dst, RTAX_RTTVAR)), r->fl.fl4_tos, - r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, - r->u.dst.hh ? (r->u.dst.hh->hh_output == + r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, + r->dst.hh ? (r->dst.hh->hh_output == dev_queue_xmit) : 0, r->rt_spec_dst, &len); @@ -467,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", - atomic_read(&ipv4_dst_ops.entries), + dst_entries_get_slow(&ipv4_dst_ops), st->in_hit, st->in_slow_tot, st->in_slow_mc, @@ -609,13 +607,13 @@ static inline int ip_rt_proc_init(void) static inline void rt_free(struct rtable *rt) { - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline void rt_drop(struct rtable *rt) { ip_rt_put(rt); - call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static inline int rt_fast_clean(struct rtable *rth) @@ -623,13 +621,13 @@ static inline int rt_fast_clean(struct rtable *rth) /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->fl.iif && rth->u.dst.rt_next; + rth->fl.iif && rth->dst.rt_next; } static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->u.dst.expires; + rth->dst.expires; } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -637,15 +635,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t unsigned long age; int ret = 0; - if (atomic_read(&rth->u.dst.__refcnt)) + if (atomic_read(&rth->dst.__refcnt)) goto out; ret = 1; - if (rth->u.dst.expires && - time_after_eq(jiffies, rth->u.dst.expires)) + if (rth->dst.expires && + time_after_eq(jiffies, rth->dst.expires)) goto out; - age = jiffies - rth->u.dst.lastuse; + age = jiffies - rth->dst.lastuse; ret = 0; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) @@ -661,7 +659,7 @@ out: return ret; */ static inline u32 rt_score(struct rtable *rt) { - u32 score = jiffies - rt->u.dst.lastuse; + u32 score = jiffies - rt->dst.lastuse; score = ~score & ~(3<<30); @@ -701,12 +699,12 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); + return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev)); } static inline int rt_is_expired(struct rtable *rth) { - return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); + return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } /* @@ -723,19 +721,23 @@ static void rt_do_flush(int process_context) for (i = 0; i <= rt_hash_mask; i++) { if (process_context && need_resched()) cond_resched(); - rth = rt_hash_table[i].chain; + rth = rcu_dereference_raw(rt_hash_table[i].chain); if (!rth) continue; spin_lock_bh(rt_hash_lock_addr(i)); #ifdef CONFIG_NET_NS { - struct rtable ** prev, * p; + struct rtable __rcu **prev; + struct rtable *p; - rth = rt_hash_table[i].chain; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); /* defer releasing the head of the list after spin_unlock */ - for (tail = rth; tail; tail = tail->u.dst.rt_next) + for (tail = rth; tail; + tail = rcu_dereference_protected(tail->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i)))) if (!rt_is_expired(tail)) break; if (rth != tail) @@ -743,10 +745,14 @@ static void rt_do_flush(int process_context) /* call rt_free on entries after the tail requiring flush */ prev = &rt_hash_table[i].chain; - for (p = *prev; p; p = next) { - next = p->u.dst.rt_next; + for (p = rcu_dereference_protected(*prev, + lockdep_is_held(rt_hash_lock_addr(i))); + p != NULL; + p = next) { + next = rcu_dereference_protected(p->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i))); if (!rt_is_expired(p)) { - prev = &p->u.dst.rt_next; + prev = &p->dst.rt_next; } else { *prev = next; rt_free(p); @@ -754,14 +760,15 @@ static void rt_do_flush(int process_context) } } #else - rth = rt_hash_table[i].chain; - rt_hash_table[i].chain = NULL; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); + rcu_assign_pointer(rt_hash_table[i].chain, NULL); tail = NULL; #endif spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth != tail; rth = next) { - next = rth->u.dst.rt_next; + next = rcu_dereference_protected(rth->dst.rt_next, 1); rt_free(rth); } } @@ -792,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) while (aux != rth) { if (compare_hash_inputs(&aux->fl, &rth->fl)) return 0; - aux = aux->u.dst.rt_next; + aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } @@ -801,7 +808,8 @@ static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; unsigned long delta; @@ -827,23 +835,24 @@ static void rt_check_expire(void) samples++; - if (*rthp == NULL) + if (rcu_dereference_raw(*rthp) == NULL) continue; length = 0; spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = *rthp) != NULL) { - prefetch(rth->u.dst.rt_next); + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { + prefetch(rth->dst.rt_next); if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); continue; } - if (rth->u.dst.expires) { + if (rth->dst.expires) { /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->u.dst.expires)) { + if (time_before_eq(jiffies, rth->dst.expires)) { nofree: tmo >>= 1; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; /* * We only count entries on * a chain with equal hash inputs once @@ -859,7 +868,7 @@ nofree: goto nofree; /* Cleanup aged off entries. */ - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); } spin_unlock_bh(rt_hash_lock_addr(i)); @@ -943,9 +952,11 @@ static int rt_garbage_collect(struct dst_ops *ops) static unsigned long last_gc; static int rover; static int equilibrium; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long now = jiffies; int goal; + int entries = dst_entries_get_fast(&ipv4_dst_ops); /* * Garbage collection is pretty expensive, @@ -955,28 +966,28 @@ static int rt_garbage_collect(struct dst_ops *ops) RT_CACHE_STAT_INC(gc_total); if (now - last_gc < ip_rt_gc_min_interval && - atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { + entries < ip_rt_max_size) { RT_CACHE_STAT_INC(gc_ignored); goto out; } + entries = dst_entries_get_slow(&ipv4_dst_ops); /* Calculate number of entries, which we want to expire now. */ - goal = atomic_read(&ipv4_dst_ops.entries) - - (ip_rt_gc_elasticity << rt_hash_log); + goal = entries - (ip_rt_gc_elasticity << rt_hash_log); if (goal <= 0) { if (equilibrium < ipv4_dst_ops.gc_thresh) equilibrium = ipv4_dst_ops.gc_thresh; - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + goal = entries - equilibrium; if (goal > 0) { equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + goal = entries - equilibrium; } } else { /* We are in dangerous area. Try to reduce cache really * aggressively. */ goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); - equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + equilibrium = entries - goal; } if (now - last_gc >= ip_rt_gc_min_interval) @@ -996,14 +1007,15 @@ static int rt_garbage_collect(struct dst_ops *ops) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; continue; } - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); goal--; } @@ -1033,14 +1045,16 @@ static int rt_garbage_collect(struct dst_ops *ops) expire >>= 1; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, i); + dst_entries_get_fast(&ipv4_dst_ops), goal, i); #endif - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) goto out; } while (!in_softirq() && time_before_eq(jiffies, now)); - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) + goto out; + if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) goto out; if (net_ratelimit()) printk(KERN_WARNING "dst cache overflow\n"); @@ -1050,11 +1064,12 @@ static int rt_garbage_collect(struct dst_ops *ops) work_done: expire += ip_rt_gc_min_interval; if (expire > ip_rt_gc_timeout || - atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) + dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || + dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) expire = ip_rt_gc_timeout; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, rover); + dst_entries_get_fast(&ipv4_dst_ops), goal, rover); #endif out: return 0; } @@ -1069,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head) while (rth) { length += has_noalias(head, rth); - rth = rth->u.dst.rt_next; + rth = rcu_dereference_protected(rth->dst.rt_next, 1); } return length >> FRACT_BITS; } @@ -1077,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head) static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp, struct sk_buff *skb, int ifindex) { - struct rtable *rth, **rthp; + struct rtable *rth, *cand; + struct rtable __rcu **rthp, **candp; unsigned long now; - struct rtable *cand, **candp; u32 min_score; int chain_length; int attempts = !in_softirq(); @@ -1091,7 +1106,7 @@ restart: candp = NULL; now = jiffies; - if (!rt_caching(dev_net(rt->u.dst.dev))) { + if (!rt_caching(dev_net(rt->dst.dev))) { /* * If we're not caching, just tell the caller we * were successful and don't touch the route. The @@ -1103,44 +1118,45 @@ restart: * Note that we do rt_free on this new route entry, so that * once its refcount hits zero, we are still able to reap it * (Thanks Alexey) - * Note also the rt_free uses call_rcu. We don't actually - * need rcu protection here, this is just our path to get - * on the route gc list. + * Note: To avoid expensive rcu stuff for this uncached dst, + * we set DST_NOCACHE so that dst_release() can free dst without + * waiting a grace period. */ + rt->dst.flags |= DST_NOCACHE; if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); + int err = arp_bind_neighbour(&rt->dst); if (err) { if (net_ratelimit()) printk(KERN_WARNING "Neighbour table failure & not caching routes.\n"); - rt_drop(rt); + ip_rt_put(rt); return err; } } - rt_free(rt); goto skip_hashing; } rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (rt_is_expired(rth)) { - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; rt_free(rth); continue; } if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { /* Put it first */ - *rthp = rth->u.dst.rt_next; + *rthp = rth->dst.rt_next; /* * Since lookup is lockfree, the deletion * must be visible to another weakly ordered CPU before * the insertion at the start of the hash chain. */ - rcu_assign_pointer(rth->u.dst.rt_next, + rcu_assign_pointer(rth->dst.rt_next, rt_hash_table[hash].chain); /* * Since lookup is lockfree, the update writes @@ -1148,18 +1164,18 @@ restart: */ rcu_assign_pointer(rt_hash_table[hash].chain, rth); - dst_use(&rth->u.dst, now); + dst_use(&rth->dst, now); spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); if (rp) *rp = rth; else - skb_dst_set(skb, &rth->u.dst); + skb_dst_set(skb, &rth->dst); return 0; } - if (!atomic_read(&rth->u.dst.__refcnt)) { + if (!atomic_read(&rth->dst.__refcnt)) { u32 score = rt_score(rth); if (score <= min_score) { @@ -1171,7 +1187,7 @@ restart: chain_length++; - rthp = &rth->u.dst.rt_next; + rthp = &rth->dst.rt_next; } if (cand) { @@ -1182,17 +1198,17 @@ restart: * only 2 entries per bucket. We will see. */ if (chain_length > ip_rt_gc_elasticity) { - *candp = cand->u.dst.rt_next; + *candp = cand->dst.rt_next; rt_free(cand); } } else { if (chain_length > rt_chain_length_max && slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { - struct net *net = dev_net(rt->u.dst.dev); + struct net *net = dev_net(rt->dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count; if (!rt_caching(net)) { printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", - rt->u.dst.dev->name, num); + rt->dst.dev->name, num); } rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1207,7 +1223,7 @@ restart: route or unicast forwarding path. */ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { - int err = arp_bind_neighbour(&rt->u.dst); + int err = arp_bind_neighbour(&rt->dst); if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1232,20 +1248,20 @@ restart: } if (net_ratelimit()) - printk(KERN_WARNING "Neighbour table overflow.\n"); + printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); rt_drop(rt); return -ENOBUFS; } } - rt->u.dst.rt_next = rt_hash_table[hash].chain; + rt->dst.rt_next = rt_hash_table[hash].chain; #if RT_CACHE_DEBUG >= 2 - if (rt->u.dst.rt_next) { + if (rt->dst.rt_next) { struct rtable *trt; printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst); - for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) + for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next) printk(" . %pI4", &trt->rt_dst); printk("\n"); } @@ -1263,24 +1279,17 @@ skip_hashing: if (rp) *rp = rt; else - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); return 0; } void rt_bind_peer(struct rtable *rt, int create) { - static DEFINE_SPINLOCK(rt_peer_lock); struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; - } - spin_unlock_bh(&rt_peer_lock); - if (peer) + if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); } @@ -1325,31 +1334,36 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) ip_select_fb_ident(iph); } +EXPORT_SYMBOL(__ip_select_ident); static void rt_del(unsigned hash, struct rtable *rt) { - struct rtable **rthp, *aux; + struct rtable __rcu **rthp; + struct rtable *aux; rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); - while ((aux = *rthp) != NULL) { + while ((aux = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (aux == rt || rt_is_expired(aux)) { - *rthp = aux->u.dst.rt_next; + *rthp = aux->dst.rt_next; rt_free(aux); continue; } - rthp = &aux->u.dst.rt_next; + rthp = &aux->dst.rt_next; } spin_unlock_bh(rt_hash_lock_addr(hash)); } +/* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { int i, k; - struct in_device *in_dev = in_dev_get(dev); - struct rtable *rth, **rthp; + struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rth; + struct rtable __rcu **rthp; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct netevent_redirect netevent; @@ -1382,9 +1396,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], rt_genid(net)); - rthp=&rt_hash_table[hash].chain; + rthp = &rt_hash_table[hash].chain; - rcu_read_lock(); while ((rth = rcu_dereference(*rthp)) != NULL) { struct rtable *rt; @@ -1393,44 +1406,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || rt_is_expired(rth) || - !net_eq(dev_net(rth->u.dst.dev), net)) { - rthp = &rth->u.dst.rt_next; + !net_eq(dev_net(rth->dst.dev), net)) { + rthp = &rth->dst.rt_next; continue; } if (rth->rt_dst != daddr || rth->rt_src != saddr || - rth->u.dst.error || + rth->dst.error || rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) + rth->dst.dev != dev) break; - dst_hold(&rth->u.dst); - rcu_read_unlock(); + dst_hold(&rth->dst); rt = dst_alloc(&ipv4_dst_ops); if (rt == NULL) { ip_rt_put(rth); - in_dev_put(in_dev); return; } /* Copy all the information. */ *rt = *rth; - rt->u.dst.__use = 1; - atomic_set(&rt->u.dst.__refcnt, 1); - rt->u.dst.child = NULL; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); + rt->dst.__use = 1; + atomic_set(&rt->dst.__refcnt, 1); + rt->dst.child = NULL; + if (rt->dst.dev) + dev_hold(rt->dst.dev); if (rt->idev) in_dev_hold(rt->idev); - rt->u.dst.obsolete = -1; - rt->u.dst.lastuse = jiffies; - rt->u.dst.path = &rt->u.dst; - rt->u.dst.neighbour = NULL; - rt->u.dst.hh = NULL; + rt->dst.obsolete = -1; + rt->dst.lastuse = jiffies; + rt->dst.path = &rt->dst; + rt->dst.neighbour = NULL; + rt->dst.hh = NULL; #ifdef CONFIG_XFRM - rt->u.dst.xfrm = NULL; + rt->dst.xfrm = NULL; #endif rt->rt_genid = rt_genid(net); rt->rt_flags |= RTCF_REDIRECTED; @@ -1439,23 +1450,23 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->rt_gateway = new_gw; /* Redirect received -> path was valid */ - dst_confirm(&rth->u.dst); + dst_confirm(&rth->dst); if (rt->peer) atomic_inc(&rt->peer->refcnt); - if (arp_bind_neighbour(&rt->u.dst) || - !(rt->u.dst.neighbour->nud_state & + if (arp_bind_neighbour(&rt->dst) || + !(rt->dst.neighbour->nud_state & NUD_VALID)) { - if (rt->u.dst.neighbour) - neigh_event_send(rt->u.dst.neighbour, NULL); + if (rt->dst.neighbour) + neigh_event_send(rt->dst.neighbour, NULL); ip_rt_put(rth); rt_drop(rt); goto do_next; } - netevent.old = &rth->u.dst; - netevent.new = &rt->u.dst; + netevent.old = &rth->dst; + netevent.new = &rt->dst; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); @@ -1464,12 +1475,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, ip_rt_put(rt); goto do_next; } - rcu_read_unlock(); do_next: ; } } - in_dev_put(in_dev); return; reject_redirect: @@ -1480,7 +1489,7 @@ reject_redirect: &old_gw, dev->name, &new_gw, &saddr, &daddr); #endif - in_dev_put(in_dev); + ; } static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) @@ -1493,8 +1502,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - (rt->u.dst.expires && - time_after_eq(jiffies, rt->u.dst.expires))) { + (rt->dst.expires && + time_after_eq(jiffies, rt->dst.expires))) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, rt_genid(dev_net(dst->dev))); @@ -1532,7 +1541,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) int log_martians; rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->u.dst.dev); + in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; @@ -1543,30 +1552,30 @@ void ip_rt_send_redirect(struct sk_buff *skb) /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) - rt->u.dst.rate_tokens = 0; + if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) + rt->dst.rate_tokens = 0; /* Too many ignored redirects; do not send anything - * set u.dst.rate_last to the last seen redirected packet. + * set dst.rate_last to the last seen redirected packet. */ - if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { - rt->u.dst.rate_last = jiffies; + if (rt->dst.rate_tokens >= ip_rt_redirect_number) { + rt->dst.rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->u.dst.rate_tokens == 0 || + if (rt->dst.rate_tokens == 0 || time_after(jiffies, - (rt->u.dst.rate_last + - (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { + (rt->dst.rate_last + + (ip_rt_redirect_load << rt->dst.rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->u.dst.rate_last = jiffies; - ++rt->u.dst.rate_tokens; + rt->dst.rate_last = jiffies; + ++rt->dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->u.dst.rate_tokens == ip_rt_redirect_number && + rt->dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", &rt->rt_src, rt->rt_iif, @@ -1581,7 +1590,7 @@ static int ip_error(struct sk_buff *skb) unsigned long now; int code; - switch (rt->u.dst.error) { + switch (rt->dst.error) { case EINVAL: default: goto out; @@ -1590,7 +1599,7 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(dev_net(rt->u.dst.dev), + IP_INC_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INNOROUTES); break; case EACCES: @@ -1599,12 +1608,12 @@ static int ip_error(struct sk_buff *skb) } now = jiffies; - rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; - if (rt->u.dst.rate_tokens > ip_rt_error_burst) - rt->u.dst.rate_tokens = ip_rt_error_burst; - rt->u.dst.rate_last = now; - if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { - rt->u.dst.rate_tokens -= ip_rt_error_cost; + rt->dst.rate_tokens += now - rt->dst.rate_last; + if (rt->dst.rate_tokens > ip_rt_error_burst) + rt->dst.rate_tokens = ip_rt_error_burst; + rt->dst.rate_last = now; + if (rt->dst.rate_tokens >= ip_rt_error_cost) { + rt->dst.rate_tokens -= ip_rt_error_cost; icmp_send(skb, ICMP_DEST_UNREACH, code, 0); } @@ -1649,7 +1658,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + rth = rcu_dereference(rth->dst.rt_next)) { unsigned short mtu = new_mtu; if (rth->fl.fl4_dst != daddr || @@ -1658,8 +1667,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rth->rt_src != iph->saddr || rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || - dst_metric_locked(&rth->u.dst, RTAX_MTU) || - !net_eq(dev_net(rth->u.dst.dev), net) || + dst_metric_locked(&rth->dst, RTAX_MTU) || + !net_eq(dev_net(rth->dst.dev), net) || rt_is_expired(rth)) continue; @@ -1667,22 +1676,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, /* BSD 4.2 compatibility hack :-( */ if (mtu == 0 && - old_mtu >= dst_mtu(&rth->u.dst) && + old_mtu >= dst_mtu(&rth->dst) && old_mtu >= 68 + (iph->ihl << 2)) old_mtu -= iph->ihl << 2; mtu = guess_mtu(old_mtu); } - if (mtu <= dst_mtu(&rth->u.dst)) { - if (mtu < dst_mtu(&rth->u.dst)) { - dst_confirm(&rth->u.dst); + if (mtu <= dst_mtu(&rth->dst)) { + if (mtu < dst_mtu(&rth->dst)) { + dst_confirm(&rth->dst); if (mtu < ip_rt_min_pmtu) { mtu = ip_rt_min_pmtu; - rth->u.dst.metrics[RTAX_LOCK-1] |= + rth->dst.metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); } - rth->u.dst.metrics[RTAX_MTU-1] = mtu; - dst_set_expires(&rth->u.dst, + rth->dst.metrics[RTAX_MTU-1] = mtu; + dst_set_expires(&rth->dst, ip_rt_mtu_expires); } est_mtu = mtu; @@ -1755,7 +1764,7 @@ static void ipv4_link_failure(struct sk_buff *skb) rt = skb_rtable(skb); if (rt) - dst_set_expires(&rt->u.dst, 0); + dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct sk_buff *skb) @@ -1783,22 +1792,25 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) { - src = FIB_RES_PREFSRC(res); - fib_res_put(&res); - } else - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, + else { + rcu_read_lock(); + if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) + src = FIB_RES_PREFSRC(res); + else + src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + rcu_read_unlock(); + } memcpy(addr, &src, 4); } #ifdef CONFIG_NET_CLS_ROUTE static void set_class_tag(struct rtable *rt, u32 tag) { - if (!(rt->u.dst.tclassid & 0xFFFF)) - rt->u.dst.tclassid |= tag & 0xFFFF; - if (!(rt->u.dst.tclassid & 0xFFFF0000)) - rt->u.dst.tclassid |= tag & 0xFFFF0000; + if (!(rt->dst.tclassid & 0xFFFF)) + rt->dst.tclassid |= tag & 0xFFFF; + if (!(rt->dst.tclassid & 0xFFFF0000)) + rt->dst.tclassid |= tag & 0xFFFF0000; } #endif @@ -1810,30 +1822,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - memcpy(rt->u.dst.metrics, fi->fib_metrics, - sizeof(rt->u.dst.metrics)); + memcpy(rt->dst.metrics, fi->fib_metrics, + sizeof(rt->dst.metrics)); if (fi->fib_mtu == 0) { - rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; - if (dst_metric_locked(&rt->u.dst, RTAX_MTU) && + rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu; + if (dst_metric_locked(&rt->dst, RTAX_MTU) && rt->rt_gateway != rt->rt_dst && - rt->u.dst.dev->mtu > 576) - rt->u.dst.metrics[RTAX_MTU-1] = 576; + rt->dst.dev->mtu > 576) + rt->dst.metrics[RTAX_MTU-1] = 576; } #ifdef CONFIG_NET_CLS_ROUTE - rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; + rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } else - rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; - - if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) - rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; - if (dst_mtu(&rt->u.dst) > IP_MAX_MTU) - rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0) - rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, + rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; + + if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) + rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; + if (dst_mtu(&rt->dst) > IP_MAX_MTU) + rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; + if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) + rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40, ip_rt_min_advmss); - if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40) - rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; + if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40) + rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; #ifdef CONFIG_NET_CLS_ROUTE #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1844,14 +1856,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) rt->rt_type = res->type; } +/* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { - unsigned hash; + unsigned int hash; struct rtable *rth; __be32 spec_dst; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); u32 itag = 0; + int err; /* Primary sanity checks. */ @@ -1866,21 +1880,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (!ipv4_is_local_multicast(daddr)) goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); - } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag, 0) < 0) - goto e_inval; - + } else { + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, + &itag, 0); + if (err < 0) + goto e_err; + } rth = dst_alloc(&ipv4_dst_ops); if (!rth) goto e_nobufs; - rth->u.dst.output = ip_rt_bug; - rth->u.dst.obsolete = -1; + rth->dst.output = ip_rt_bug; + rth->dst.obsolete = -1; - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -1888,13 +1904,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = init_net.loopback_dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; @@ -1902,27 +1918,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; if (our) { - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; } #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) - rth->u.dst.input = ip_mr_input; + rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); - in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); e_nobufs: - in_dev_put(in_dev); return -ENOBUFS; - e_inval: - in_dev_put(in_dev); return -EINVAL; +e_err: + return err; } @@ -1956,22 +1970,22 @@ static void ip_handle_martian_source(struct net_device *dev, #endif } +/* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) { - struct rtable *rth; int err; struct in_device *out_dev; - unsigned flags = 0; + unsigned int flags = 0; __be32 spec_dst; u32 itag; /* get a working reference to the output device */ - out_dev = in_dev_get(FIB_RES_DEV(*res)); + out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); if (out_dev == NULL) { if (net_ratelimit()) printk(KERN_CRIT "Bug in ip_route_input" \ @@ -1986,7 +2000,6 @@ static int __mkroute_input(struct sk_buff *skb, ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); - err = -EINVAL; goto cleanup; } @@ -2020,12 +2033,12 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; + rth->dst.flags |= DST_NOXFRM; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -2035,16 +2048,16 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = daddr; rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex; - rth->u.dst.dev = (out_dev)->dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = (out_dev)->dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; - rth->u.dst.obsolete = -1; - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); + rth->dst.obsolete = -1; + rth->dst.input = ip_forward; + rth->dst.output = ip_output; + rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); rt_set_nexthop(rth, res, itag); @@ -2053,8 +2066,6 @@ static int __mkroute_input(struct sk_buff *skb, *result = rth; err = 0; cleanup: - /* release the working reference to the output device */ - in_dev_put(out_dev); return err; } @@ -2080,7 +2091,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, - rt_genid(dev_net(rth->u.dst.dev))); + rt_genid(dev_net(rth->dst.dev))); return rt_intern_hash(hash, rth, NULL, skb, fl->iif); } @@ -2092,13 +2103,14 @@ static int ip_mkroute_input(struct sk_buff *skb, * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. + * called with rcu_read_lock() */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { struct fib_result res; - struct in_device *in_dev = in_dev_get(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr, .saddr = saddr, @@ -2113,7 +2125,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned hash; __be32 spec_dst; int err = -EINVAL; - int free_res = 0; struct net * net = dev_net(dev); /* IP on this device is disabled. */ @@ -2129,7 +2140,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, ipv4_is_loopback(saddr)) goto martian_source; - if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) + if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; @@ -2138,19 +2149,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_zeronet(saddr)) goto martian_source; - if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || - ipv4_is_loopback(daddr)) + if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) goto martian_destination; /* * Now we are ready to route packet. */ - if ((err = fib_lookup(net, &fl, &res)) != 0) { + err = fib_lookup(net, &fl, &res); + if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; } - free_res = 1; RT_CACHE_STAT_INC(in_slow_tot); @@ -2158,13 +2168,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; if (res.type == RTN_LOCAL) { - int result; - result = fib_validate_source(saddr, daddr, tos, - net->loopback_dev->ifindex, - dev, &spec_dst, &itag, skb->mark); - if (result < 0) - goto martian_source; - if (result) + err = fib_validate_source(saddr, daddr, tos, + net->loopback_dev->ifindex, + dev, &spec_dst, &itag, skb->mark); + if (err < 0) + goto martian_source_keep_err; + if (err) flags |= RTCF_DIRECTSRC; spec_dst = daddr; goto local_input; @@ -2176,10 +2185,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_destination; err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); -done: - in_dev_put(in_dev); - if (free_res) - fib_res_put(&res); out: return err; brd_input: @@ -2192,7 +2197,7 @@ brd_input: err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag, skb->mark); if (err < 0) - goto martian_source; + goto martian_source_keep_err; if (err) flags |= RTCF_DIRECTSRC; } @@ -2205,14 +2210,14 @@ local_input: if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; - rth->u.dst.obsolete = -1; + rth->dst.output= ip_rt_bug; + rth->dst.obsolete = -1; rth->rt_genid = rt_genid(net); - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -2220,26 +2225,26 @@ local_input: rth->fl.fl4_src = saddr; rth->rt_src = saddr; #ifdef CONFIG_NET_CLS_ROUTE - rth->u.dst.tclassid = itag; + rth->dst.tclassid = itag; #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = net->loopback_dev; - dev_hold(rth->u.dst.dev); - rth->idev = in_dev_get(rth->u.dst.dev); + rth->dst.dev = net->loopback_dev; + dev_hold(rth->dst.dev); + rth->idev = in_dev_get(rth->dst.dev); rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->u.dst.input= ip_local_deliver; + rth->dst.input= ip_local_deliver; rth->rt_flags = flags|RTCF_LOCAL; if (res.type == RTN_UNREACHABLE) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; + rth->dst.input= ip_error; + rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); - goto done; + goto out; no_route: RT_CACHE_STAT_INC(in_no_route); @@ -2262,19 +2267,21 @@ martian_destination: e_hostunreach: err = -EHOSTUNREACH; - goto done; + goto out; e_inval: err = -EINVAL; - goto done; + goto out; e_nobufs: err = -ENOBUFS; - goto done; + goto out; martian_source: + err = -EINVAL; +martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto e_inval; + goto out; } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2284,32 +2291,34 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned hash; int iif = dev->ifindex; struct net *net; + int res; net = dev_net(dev); + rcu_read_lock(); + if (!rt_caching(net)) goto skip_cache; tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif, rt_genid(net)); - rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + rth = rcu_dereference(rth->dst.rt_next)) { if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && - net_eq(dev_net(rth->u.dst.dev), net) && + net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { if (noref) { - dst_use_noref(&rth->u.dst, jiffies); - skb_dst_set_noref(skb, &rth->u.dst); + dst_use_noref(&rth->dst, jiffies); + skb_dst_set_noref(skb, &rth->dst); } else { - dst_use(&rth->u.dst, jiffies); - skb_dst_set(skb, &rth->u.dst); + dst_use(&rth->dst, jiffies); + skb_dst_set(skb, &rth->dst); } RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -2317,7 +2326,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, } RT_CACHE_STAT_INC(in_hlist_search); } - rcu_read_unlock(); skip_cache: /* Multicast recognition logic is moved from route cache to here. @@ -2332,12 +2340,11 @@ skip_cache: route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { - struct in_device *in_dev; + struct in_device *in_dev = __in_dev_get_rcu(dev); - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { + if (in_dev) { int our = ip_check_mc(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || @@ -2345,18 +2352,22 @@ skip_cache: IN_DEV_MFORWARD(in_dev)) #endif ) { + int res = ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); rcu_read_unlock(); - return ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); + return res; } } rcu_read_unlock(); return -EINVAL; } - return ip_route_input_slow(skb, daddr, saddr, tos, dev); + res = ip_route_input_slow(skb, daddr, saddr, tos, dev); + rcu_read_unlock(); + return res; } EXPORT_SYMBOL(ip_route_input_common); +/* called with rcu_read_lock() */ static int __mkroute_output(struct rtable **result, struct fib_result *res, const struct flowi *fl, @@ -2367,60 +2378,54 @@ static int __mkroute_output(struct rtable **result, struct rtable *rth; struct in_device *in_dev; u32 tos = RT_FL_TOS(oldflp); - int err = 0; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) + if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) return -EINVAL; - if (fl->fl4_dst == htonl(0xFFFFFFFF)) + if (ipv4_is_lbcast(fl->fl4_dst)) res->type = RTN_BROADCAST; else if (ipv4_is_multicast(fl->fl4_dst)) res->type = RTN_MULTICAST; - else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) + else if (ipv4_is_zeronet(fl->fl4_dst)) return -EINVAL; if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - /* get work reference to inet device */ - in_dev = in_dev_get(dev_out); + in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return -EINVAL; if (res->type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - if (res->fi) { - fib_info_put(res->fi); - res->fi = NULL; - } + res->fi = NULL; } else if (res->type == RTN_MULTICAST) { - flags |= RTCF_MULTICAST|RTCF_LOCAL; + flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use - default one, but do not gateway in this case. - Yes, it is hack. + * default one, but do not gateway in this case. + * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) { - fib_info_put(res->fi); + if (res->fi && res->prefixlen < 4) res->fi = NULL; - } } rth = dst_alloc(&ipv4_dst_ops); - if (!rth) { - err = -ENOBUFS; - goto cleanup; - } + if (!rth) + return -ENOBUFS; + + in_dev_hold(in_dev); + rth->idev = in_dev; - atomic_set(&rth->u.dst.__refcnt, 1); - rth->u.dst.flags= DST_HOST; + atomic_set(&rth->dst.__refcnt, 1); + rth->dst.flags= DST_HOST; if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->u.dst.flags |= DST_NOXFRM; + rth->dst.flags |= DST_NOXFRM; if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->u.dst.flags |= DST_NOPOLICY; + rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = oldflp->fl4_dst; rth->fl.fl4_tos = tos; @@ -2432,35 +2437,34 @@ static int __mkroute_output(struct rtable **result, rth->rt_iif = oldflp->oif ? : dev_out->ifindex; /* get references to the devices that are to be hold by the routing cache entry */ - rth->u.dst.dev = dev_out; + rth->dst.dev = dev_out; dev_hold(dev_out); - rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; - rth->u.dst.output=ip_output; - rth->u.dst.obsolete = -1; + rth->dst.output=ip_output; + rth->dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { - rth->u.dst.input = ip_local_deliver; + rth->dst.input = ip_local_deliver; rth->rt_spec_dst = fl->fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { - rth->u.dst.output = ip_mc_output; + rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (res->type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(oldflp->fl4_dst)) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_mc_output; + rth->dst.input = ip_mr_input; + rth->dst.output = ip_mc_output; } } #endif @@ -2469,15 +2473,11 @@ static int __mkroute_output(struct rtable **result, rt_set_nexthop(rth, res, 0); rth->rt_flags = flags; - *result = rth; - cleanup: - /* release work reference to inet device */ - in_dev_put(in_dev); - - return err; + return 0; } +/* called with rcu_read_lock() */ static int ip_mkroute_output(struct rtable **rp, struct fib_result *res, const struct flowi *fl, @@ -2499,6 +2499,7 @@ static int ip_mkroute_output(struct rtable **rp, /* * Major route resolver routine. + * called with rcu_read_lock(); */ static int ip_route_output_slow(struct net *net, struct rtable **rp, @@ -2517,9 +2518,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, .iif = net->loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; - unsigned flags = 0; + unsigned int flags = 0; struct net_device *dev_out = NULL; - int free_res = 0; int err; @@ -2545,9 +2545,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (oldflp->oif == 0 && (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + ipv4_is_lbcast(oldflp->fl4_dst))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); + dev_out = __ip_dev_find(net, oldflp->fl4_src, false); if (dev_out == NULL) goto out; @@ -2572,29 +2572,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) + if (!__ip_dev_find(net, oldflp->fl4_src, false)) goto out; - dev_put(dev_out); - dev_out = NULL; } } if (oldflp->oif) { - dev_out = dev_get_by_index(net, oldflp->oif); + dev_out = dev_get_by_index_rcu(net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ - if (__in_dev_get_rtnl(dev_out) == NULL) { - dev_put(dev_out); + if (rcu_dereference(dev_out->ip_ptr) == NULL) goto out; /* Wrong error code */ - } if (ipv4_is_local_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF)) { + ipv4_is_lbcast(oldflp->fl4_dst)) { if (!fl.fl4_src) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); @@ -2614,10 +2609,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, fl.fl4_dst = fl.fl4_src; if (!fl.fl4_dst) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); - if (dev_out) - dev_put(dev_out); dev_out = net->loopback_dev; - dev_hold(dev_out); fl.oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; @@ -2651,23 +2643,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, res.type = RTN_UNICAST; goto make_route; } - if (dev_out) - dev_put(dev_out); err = -ENETUNREACH; goto out; } - free_res = 1; if (res.type == RTN_LOCAL) { if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; - if (dev_out) - dev_put(dev_out); dev_out = net->loopback_dev; - dev_hold(dev_out); fl.oif = dev_out->ifindex; - if (res.fi) - fib_info_put(res.fi); res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; @@ -2684,28 +2668,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); - if (dev_out) - dev_put(dev_out); dev_out = FIB_RES_DEV(res); - dev_hold(dev_out); fl.oif = dev_out->ifindex; make_route: err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); - - if (free_res) - fib_res_put(&res); - if (dev_out) - dev_put(dev_out); out: return err; } int __ip_route_output_key(struct net *net, struct rtable **rp, const struct flowi *flp) { - unsigned hash; + unsigned int hash; + int res; struct rtable *rth; if (!rt_caching(net)) @@ -2715,7 +2692,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; - rth = rcu_dereference_bh(rth->u.dst.rt_next)) { + rth = rcu_dereference_bh(rth->dst.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && @@ -2723,9 +2700,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && - net_eq(dev_net(rth->u.dst.dev), net) && + net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { - dst_use(&rth->u.dst, jiffies); + dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); *rp = rth; @@ -2736,11 +2713,18 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rcu_read_unlock_bh(); slow_output: - return ip_route_output_slow(net, rp, flp); + rcu_read_lock(); + res = ip_route_output_slow(net, rp, flp); + rcu_read_unlock(); + return res; } - EXPORT_SYMBOL_GPL(__ip_route_output_key); +static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } @@ -2749,9 +2733,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, - .check = ipv4_dst_check, + .check = ipv4_blackhole_dst_check, .update_pmtu = ipv4_rt_blackhole_update_pmtu, - .entries = ATOMIC_INIT(0), }; @@ -2762,15 +2745,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_alloc(&ipv4_dst_blackhole_ops); if (rt) { - struct dst_entry *new = &rt->u.dst; + struct dst_entry *new = &rt->dst; atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; - memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); - new->dev = ort->u.dst.dev; + new->dev = ort->dst.dev; if (new->dev) dev_hold(new->dev); @@ -2794,9 +2777,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_free(new); } - dst_release(&(*rp)->u.dst); + dst_release(&(*rp)->dst); *rp = rt; - return (rt ? 0 : -ENOMEM); + return rt ? 0 : -ENOMEM; } int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, @@ -2822,13 +2805,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, return 0; } - EXPORT_SYMBOL_GPL(ip_route_output_flow); int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) { return ip_route_output_flow(net, rp, flp, NULL, 0); } +EXPORT_SYMBOL(ip_route_output_key); static int rt_fill_info(struct net *net, struct sk_buff *skb, u32 pid, u32 seq, int event, @@ -2864,11 +2847,11 @@ static int rt_fill_info(struct net *net, r->rtm_src_len = 32; NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); } - if (rt->u.dst.dev) - NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); + if (rt->dst.dev) + NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); #ifdef CONFIG_NET_CLS_ROUTE - if (rt->u.dst.tclassid) - NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); + if (rt->dst.tclassid) + NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif if (rt->fl.iif) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); @@ -2878,12 +2861,16 @@ static int rt_fill_info(struct net *net, if (rt->rt_dst != rt->rt_gateway) NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); - if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) goto nla_put_failure; - error = rt->u.dst.error; - expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; + if (rt->fl.mark) + NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); + + error = rt->dst.error; + expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; if (rt->peer) { + inet_peer_refcheck(rt->peer); id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; @@ -2914,7 +2901,7 @@ static int rt_fill_info(struct net *net, NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); } - if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, + if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, expires, error) < 0) goto nla_put_failure; @@ -2935,6 +2922,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void __be32 src = 0; u32 iif; int err; + int mark; struct sk_buff *skb; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); @@ -2962,6 +2950,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; + mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; if (iif) { struct net_device *dev; @@ -2974,13 +2963,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void skb->protocol = htons(ETH_P_IP); skb->dev = dev; + skb->mark = mark; local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); rt = skb_rtable(skb); - if (err == 0 && rt->u.dst.error) - err = -rt->u.dst.error; + if (err == 0 && rt->dst.error) + err = -rt->dst.error; } else { struct flowi fl = { .nl_u = { @@ -2991,6 +2981,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void }, }, .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, + .mark = mark, }; err = ip_route_output_key(net, &rt, &fl); } @@ -2998,7 +2989,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb_dst_set(skb, &rt->u.dst); + skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; @@ -3034,12 +3025,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; rcu_read_lock_bh(); for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) { - if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) + rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { + if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) continue; if (rt_is_expired(rt)) continue; - skb_dst_set_noref(skb, &rt->u.dst); + skb_dst_set_noref(skb, &rt->dst); if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { @@ -3313,6 +3304,12 @@ int __init ip_rt_init(void) ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; + if (dst_entries_init(&ipv4_dst_ops) < 0) + panic("IP: failed to allocate ipv4_dst_ops counter\n"); + + if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) + panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); + rt_hash_table = (struct rt_hash_bucket *) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), @@ -3365,6 +3362,3 @@ void __init ip_static_sysctl_init(void) register_sysctl_paths(ipv4_path, ipv4_skeleton); } #endif - -EXPORT_SYMBOL(__ip_select_ident); -EXPORT_SYMBOL(ip_route_output_key); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9f6b22206c52..650cace2180d 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -18,8 +18,8 @@ #include <net/tcp.h> #include <net/route.h> -/* Timestamps: lowest 9 bits store TCP options */ -#define TSBITS 9 +/* Timestamps: lowest bits store TCP options */ +#define TSBITS 6 #define TSMASK (((__u32)1 << TSBITS) - 1) extern int sysctl_tcp_syncookies; @@ -58,7 +58,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, /* * when syncookies are in effect and tcp timestamps are enabled we encode - * tcp options in the lowest 9 bits of the timestamp value that will be + * tcp options in the lower bits of the timestamp value that will be * sent in the syn-ack. * Since subsequent timestamps use the normal tcp_time_stamp value, we * must make sure that the resulting initial timestamp is <= tcp_time_stamp. @@ -70,11 +70,10 @@ __u32 cookie_init_timestamp(struct request_sock *req) u32 options = 0; ireq = inet_rsk(req); - if (ireq->wscale_ok) { - options = ireq->snd_wscale; - options |= ireq->rcv_wscale << 4; - } - options |= ireq->sack_ok << 8; + + options = ireq->wscale_ok ? ireq->snd_wscale : 0xf; + options |= ireq->sack_ok << 4; + options |= ireq->ecn_ok << 5; ts = ts_now & ~TSMASK; ts |= options; @@ -138,23 +137,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, } /* - * This table has to be sorted and terminated with (__u16)-1. - * XXX generate a better table. - * Unresolved Issues: HIPPI with a 64k MSS is not well supported. + * MSS Values are taken from the 2009 paper + * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: + * - values 1440 to 1460 accounted for 80% of observed mss values + * - values outside the 536-1460 range are rare (<0.2%). + * + * Table must be sorted. */ static __u16 const msstab[] = { - 64 - 1, - 256 - 1, - 512 - 1, - 536 - 1, - 1024 - 1, - 1440 - 1, - 1460 - 1, - 4312 - 1, - (__u16)-1 + 64, + 512, + 536, + 1024, + 1440, + 1460, + 4312, + 8960, }; -/* The number doesn't include the -1 terminator */ -#define NUM_MSS (ARRAY_SIZE(msstab) - 1) /* * Generate a syncookie. mssp points to the mss, which is returned @@ -169,10 +168,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) tcp_synq_overflow(sk); - /* XXX sort msstab[] by probability? Binary search? */ - for (mssind = 0; mss > msstab[mssind + 1]; mssind++) - ; - *mssp = msstab[mssind] + 1; + for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) + if (mss >= msstab[mssind]) + break; + *mssp = msstab[mssind]; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); @@ -202,7 +201,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) jiffies / (HZ * 60), COUNTER_TRIES); - return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; + return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, @@ -227,26 +226,38 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, * additional tcp options in the timestamp. * This extracts these options from the timestamp echo. * - * The lowest 4 bits are for snd_wscale - * The next 4 lsb are for rcv_wscale - * The next lsb is for sack_ok + * The lowest 4 bits store snd_wscale. + * next 2 bits indicate SACK and ECN support. + * + * return false if we decode an option that should not be. */ -void cookie_check_timestamp(struct tcp_options_received *tcp_opt) +bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok) { - /* echoed timestamp, 9 lowest bits contain options */ + /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr & TSMASK; - tcp_opt->snd_wscale = options & 0xf; - options >>= 4; - tcp_opt->rcv_wscale = options & 0xf; + if (!tcp_opt->saw_tstamp) { + tcp_clear_options(tcp_opt); + return true; + } + + if (!sysctl_tcp_timestamps) + return false; tcp_opt->sack_ok = (options >> 4) & 0x1; + *ecn_ok = (options >> 5) & 1; + if (*ecn_ok && !sysctl_tcp_ecn) + return false; + + if (tcp_opt->sack_ok && !sysctl_tcp_sack) + return false; - if (tcp_opt->sack_ok) - tcp_sack_reset(tcp_opt); + if ((options & 0xf) == 0xf) + return true; /* no window scaling */ - if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale) - tcp_opt->wscale_ok = 1; + tcp_opt->wscale_ok = 1; + tcp_opt->snd_wscale = options & 0xf; + return sysctl_tcp_window_scaling != 0; } EXPORT_SYMBOL(cookie_check_timestamp); @@ -265,8 +276,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int mss; struct rtable *rt; __u8 rcv_wscale; + bool ecn_ok; - if (!sysctl_tcp_syncookies || !th->ack) + if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk) || @@ -281,8 +293,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, &hash_location, 0); - if (tcp_opt.saw_tstamp) - cookie_check_timestamp(&tcp_opt); + if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) + goto out; ret = NULL; req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ @@ -298,9 +310,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rmt_port = th->source; ireq->loc_addr = ip_hdr(skb)->daddr; ireq->rmt_addr = ip_hdr(skb)->saddr; - ireq->ecn_ok = 0; + ireq->ecn_ok = ecn_ok; ireq->snd_wscale = tcp_opt.snd_wscale; - ireq->rcv_wscale = tcp_opt.rcv_wscale; ireq->sack_ok = tcp_opt.sack_ok; ireq->wscale_ok = tcp_opt.wscale_ok; ireq->tstamp_ok = tcp_opt.saw_tstamp; @@ -354,15 +365,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, } /* Try to redo what tcp_v4_send_synack did. */ - req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, &rcv_wscale, - dst_metric(&rt->u.dst, RTAX_INITRWND)); + dst_metric(&rt->dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; - ret = get_cookie_sock(sk, skb, req, &rt->u.dst); + ret = get_cookie_sock(sk, skb, req, &rt->dst); out: return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 65afeaec15b7..1664a0590bb8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -315,7 +315,6 @@ struct tcp_splice_state { * is strict, actions are advisory and have some latency. */ int tcp_memory_pressure __read_mostly; - EXPORT_SYMBOL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) @@ -325,7 +324,6 @@ void tcp_enter_memory_pressure(struct sock *sk) tcp_memory_pressure = 1; } } - EXPORT_SYMBOL(tcp_enter_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ @@ -388,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) */ mask = 0; - if (sk->sk_err) - mask = POLLERR; /* * POLLHUP is certainly not done right. But poll() doesn't @@ -453,13 +449,20 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) mask |= POLLOUT | POLLWRNORM; } - } + } else + mask |= POLLOUT | POLLWRNORM; if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; } + /* This barrier is coupled with smp_wmb() in tcp_reset() */ + smp_rmb(); + if (sk->sk_err) + mask |= POLLERR; + return mask; } +EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { @@ -508,10 +511,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } +EXPORT_SYMBOL(tcp_ioctl); static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } @@ -527,7 +531,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) skb->csum = 0; tcb->seq = tcb->end_seq = tp->write_seq; - tcb->flags = TCPCB_FLAG_ACK; + tcb->flags = TCPHDR_ACK; tcb->sacked = 0; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); @@ -676,6 +680,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, return ret; } +EXPORT_SYMBOL(tcp_splice_read); struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) { @@ -816,7 +821,7 @@ new_segment: skb_shinfo(skb)->gso_segs = 0; if (!copied) - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; copied += copy; poffset += copy; @@ -857,15 +862,15 @@ out_err: return sk_stream_error(sk, flags, err); } -ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags) +int tcp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) { ssize_t res; - struct sock *sk = sock->sk; if (!(sk->sk_route_caps & NETIF_F_SG) || !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) - return sock_no_sendpage(sock, page, offset, size, flags); + return sock_no_sendpage(sk->sk_socket, page, offset, size, + flags); lock_sock(sk); TCP_CHECK_TIMER(sk); @@ -874,6 +879,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, release_sock(sk); return res; } +EXPORT_SYMBOL(tcp_sendpage); #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) @@ -898,10 +904,9 @@ static inline int select_size(struct sock *sk, int sg) return tmp; } -int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, +int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { - struct sock *sk = sock->sk; struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -938,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, sg = sk->sk_route_caps & NETIF_F_SG; while (--iovlen >= 0) { - int seglen = iov->iov_len; + size_t seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; iov++; @@ -1062,7 +1067,7 @@ new_segment: } if (!copied) - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; @@ -1122,6 +1127,7 @@ out_err: release_sock(sk); return err; } +EXPORT_SYMBOL(tcp_sendmsg); /* * Handle reading urgent data. BSD has very simple semantics for @@ -1381,6 +1387,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_cleanup_rbuf(sk, copied); return copied; } +EXPORT_SYMBOL(tcp_read_sock); /* * This routine copies from a sock struct into the user buffer. @@ -1775,6 +1782,7 @@ recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; } +EXPORT_SYMBOL(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { @@ -1867,6 +1875,7 @@ void tcp_shutdown(struct sock *sk, int how) tcp_send_fin(sk); } } +EXPORT_SYMBOL(tcp_shutdown); void tcp_close(struct sock *sk, long timeout) { @@ -1899,6 +1908,10 @@ void tcp_close(struct sock *sk, long timeout) sk_mem_reclaim(sk); + /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ + if (sk->sk_state == TCP_CLOSE) + goto adjudge_to_death; + /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk @@ -2002,11 +2015,8 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - int orphan_count = percpu_counter_read_positive( - sk->sk_prot->orphan_count); - sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, orphan_count)) { + if (tcp_too_many_orphans(sk, 0)) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); @@ -2026,6 +2036,7 @@ out: local_bh_enable(); sock_put(sk); } +EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ @@ -2099,6 +2110,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_error_report(sk); return err; } +EXPORT_SYMBOL(tcp_disconnect); /* * Socket option code for TCP. @@ -2176,6 +2188,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, GFP_KERNEL); if (cvp == NULL) return -ENOMEM; + + kref_init(&cvp->kref); } lock_sock(sk); tp->rx_opt.cookie_in_always = @@ -2190,12 +2204,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, */ kref_put(&tp->cookie_values->kref, tcp_cookie_values_release); - kref_init(&cvp->kref); - tp->cookie_values = cvp; } else { cvp = tp->cookie_values; } } + if (cvp != NULL) { cvp->cookie_desired = ctd.tcpct_cookie_desired; @@ -2209,6 +2222,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, cvp->s_data_desired = ctd.tcpct_s_data_desired; cvp->s_data_constant = 0; /* false */ } + + tp->cookie_values = cvp; } release_sock(sk); return err; @@ -2377,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = tp->af_specific->md5_parse(sk, optval, optlen); break; #endif - + case TCP_USER_TIMEOUT: + /* Cap the max timeout in ms TCP will retry/retrans + * before giving up and aborting (ETIMEDOUT) a connection. + */ + icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; default: err = -ENOPROTOOPT; break; @@ -2397,6 +2417,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } +EXPORT_SYMBOL(tcp_setsockopt); #ifdef CONFIG_COMPAT int compat_tcp_setsockopt(struct sock *sk, int level, int optname, @@ -2407,7 +2428,6 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL(compat_tcp_setsockopt); #endif @@ -2473,7 +2493,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; } - EXPORT_SYMBOL_GPL(tcp_get_info); static int do_tcp_getsockopt(struct sock *sk, int level, @@ -2591,6 +2610,16 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; } + case TCP_THIN_LINEAR_TIMEOUTS: + val = tp->thin_lto; + break; + case TCP_THIN_DUPACK: + val = tp->thin_dupack; + break; + + case TCP_USER_TIMEOUT: + val = jiffies_to_msecs(icsk->icsk_user_timeout); + break; default: return -ENOPROTOOPT; } @@ -2612,6 +2641,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, optval, optlen); return do_tcp_getsockopt(sk, level, optname, optval, optlen); } +EXPORT_SYMBOL(tcp_getsockopt); #ifdef CONFIG_COMPAT int compat_tcp_getsockopt(struct sock *sk, int level, int optname, @@ -2622,7 +2652,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, optval, optlen); return do_tcp_getsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL(compat_tcp_getsockopt); #endif @@ -2859,7 +2888,6 @@ void tcp_free_md5sig_pool(void) if (pool) __tcp_free_md5sig_pool(pool); } - EXPORT_SYMBOL(tcp_free_md5sig_pool); static struct tcp_md5sig_pool * __percpu * @@ -2935,7 +2963,6 @@ retry: } return pool; } - EXPORT_SYMBOL(tcp_alloc_md5sig_pool); @@ -2959,7 +2986,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) spin_unlock(&tcp_md5sig_pool_lock); if (p) - return *per_cpu_ptr(p, smp_processor_id()); + return *this_cpu_ptr(p); local_bh_enable(); return NULL; @@ -2987,7 +3014,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, th->check = old_checksum; return err; } - EXPORT_SYMBOL(tcp_md5_hash_header); int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, @@ -3000,6 +3026,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, const unsigned head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); + struct sk_buff *frag_iter; sg_init_table(&sg, 1); @@ -3014,9 +3041,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, return 1; } + skb_walk_frags(skb, frag_iter) + if (tcp_md5_hash_skb_data(hp, frag_iter, 0)) + return 1; + return 0; } - EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) @@ -3026,7 +3056,6 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) sg_init_one(&sg, key->key, key->keylen); return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); } - EXPORT_SYMBOL(tcp_md5_hash_key); #endif @@ -3193,7 +3222,7 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long nr_pages, limit; - int order, i, max_share; + int i, max_share, cnt; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3242,22 +3271,12 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } - /* Try to be a bit smarter and adjust defaults depending - * on available memory. - */ - for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); - order++) - ; - if (order >= 4) { - tcp_death_row.sysctl_max_tw_buckets = 180000; - sysctl_tcp_max_orphans = 4096 << (order - 4); - sysctl_max_syn_backlog = 1024; - } else if (order < 3) { - tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); - sysctl_tcp_max_orphans >>= (3 - order); - sysctl_max_syn_backlog = 128; - } + + cnt = tcp_hashinfo.ehash_mask + 1; + + tcp_death_row.sysctl_max_tw_buckets = cnt / 2; + sysctl_tcp_max_orphans = cnt / 2; + sysctl_max_syn_backlog = max(128, cnt / 256); /* Set the pressure threshold to be a fraction of global memory that * is up to 1/2 at 256 MB, decreasing toward zero with the amount of @@ -3298,16 +3317,3 @@ void __init tcp_init(void) tcp_secret_retiring = &tcp_secret_two; tcp_secret_secondary = &tcp_secret_two; } - -EXPORT_SYMBOL(tcp_close); -EXPORT_SYMBOL(tcp_disconnect); -EXPORT_SYMBOL(tcp_getsockopt); -EXPORT_SYMBOL(tcp_ioctl); -EXPORT_SYMBOL(tcp_poll); -EXPORT_SYMBOL(tcp_read_sock); -EXPORT_SYMBOL(tcp_recvmsg); -EXPORT_SYMBOL(tcp_sendmsg); -EXPORT_SYMBOL(tcp_splice_read); -EXPORT_SYMBOL(tcp_sendpage); -EXPORT_SYMBOL(tcp_setsockopt); -EXPORT_SYMBOL(tcp_shutdown); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0ec9bd0ae94f..850c737e08e2 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; - char *clone, *name; + char *saved_clone, *clone, *name; int ret = 0; - clone = kstrdup(val, GFP_USER); + saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; @@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val) } out: spin_unlock(&tcp_cong_list_lock); + kfree(saved_clone); return ret; } diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 1eba160b72dc..00ca688d8964 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -6,7 +6,7 @@ * The algorithm is described in: * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm * for High-Speed Networks" - * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf + * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf * * Implemented from description in paper and ns-2 simulation. * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 548d575e6cc6..3357f69e353d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; +EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_ecn __read_mostly = 2; +EXPORT_SYMBOL(sysctl_tcp_ecn); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 2; +EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -179,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -void tcp_enter_quickack_mode(struct sock *sk) +static void tcp_enter_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk); @@ -419,15 +422,16 @@ void tcp_initialize_rcv_mss(struct sock *sk) inet_csk(sk)->icsk_ack.rcv_mss = hint; } +EXPORT_SYMBOL(tcp_initialize_rcv_mss); /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. - * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> + * <http://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at - * <http://www.psc.edu/~jheffner/senior_thesis.ps>, + * <http://staff.psc.edu/jheffner/>, * though this reference is out of date. A new paper * is pending. */ @@ -801,25 +805,12 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC3390. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than use a multiplier in the relevant range. - */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { - if (tp->mss_cache > 1460) - cwnd = 2; - else - cwnd = (tp->mss_cache > 1095) ? 3 : 4; - } + if (!cwnd) + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -2310,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp) static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); + return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; } static inline int tcp_head_timedout(struct sock *sk) @@ -2504,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk) /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ -static void tcp_mark_head_lost(struct sock *sk, int packets) +static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -2512,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) int err; unsigned int mss; - if (packets == 0) - return; - WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; + /* Head already handled? */ + if (mark_head && skb != tcp_write_queue_head(sk)) + return; } else { skb = tcp_write_queue_head(sk); cnt = 0; @@ -2541,7 +2532,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) cnt += tcp_skb_pcount(skb); if (cnt > packets) { - if (tcp_is_sack(tp) || (oldcnt >= packets)) + if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + (oldcnt >= packets)) break; mss = skb_shinfo(skb)->gso_size; @@ -2552,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) } tcp_skb_mark_lost(tp, skb); + + if (mark_head) + break; } tcp_verify_left_out(tp); } @@ -2563,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) struct tcp_sock *tp = tcp_sk(sk); if (tcp_is_reno(tp)) { - tcp_mark_head_lost(sk, 1); + tcp_mark_head_lost(sk, 1, 1); } else if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, lost); + tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; - if (sacked_upto < fast_rexmit) - sacked_upto = fast_rexmit; - tcp_mark_head_lost(sk, sacked_upto); + if (sacked_upto >= 0) + tcp_mark_head_lost(sk, sacked_upto, 0); + else if (fast_rexmit) + tcp_mark_head_lost(sk, 1, 1); } tcp_timeout_skbs(sk); @@ -2882,7 +2878,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.probe_size; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->rcv_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; @@ -2938,6 +2934,7 @@ void tcp_simple_retransmit(struct sock *sk) } tcp_xmit_retransmit_queue(sk); } +EXPORT_SYMBOL(tcp_simple_retransmit); /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, @@ -2978,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); + tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); } @@ -3286,7 +3283,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if (!(scb->flags & TCPCB_FLAG_SYN)) { + if (!(scb->flags & TCPHDR_SYN)) { flag |= FLAG_DATA_ACKED; } else { flag |= FLAG_SYN_ACKED; @@ -3406,8 +3403,8 @@ static void tcp_ack_probe(struct sock *sk) static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) { - return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || - inet_csk(sk)->icsk_ca_state != TCP_CA_Open); + return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) @@ -3424,9 +3421,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { - return (after(ack, tp->snd_una) || + return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || - (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); } /* Update our send window. @@ -3858,6 +3855,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } } } +EXPORT_SYMBOL(tcp_parse_options); static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) { @@ -3924,13 +3922,14 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th) if (opsize < 2 || opsize > length) return NULL; if (opcode == TCPOPT_MD5SIG) - return ptr; + return opsize == TCPOLEN_MD5SIG ? ptr : NULL; } ptr += opsize - 2; length -= opsize; } return NULL; } +EXPORT_SYMBOL(tcp_parse_md5sig_option); #endif static inline void tcp_store_ts_recent(struct tcp_sock *tp) @@ -4041,6 +4040,8 @@ static void tcp_reset(struct sock *sk) default: sk->sk_err = ECONNRESET; } + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); @@ -5432,6 +5433,7 @@ discard: __kfree_skb(skb); return 0; } +EXPORT_SYMBOL(tcp_rcv_established); static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) @@ -5931,14 +5933,4 @@ discard: } return 0; } - -EXPORT_SYMBOL(sysctl_tcp_ecn); -EXPORT_SYMBOL(sysctl_tcp_reordering); -EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); -EXPORT_SYMBOL(tcp_parse_options); -#ifdef CONFIG_TCP_MD5SIG -EXPORT_SYMBOL(tcp_parse_md5sig_option); -#endif -EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); -EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fe193e53af44..8f8527d41682 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -84,6 +84,7 @@ int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; +EXPORT_SYMBOL(sysctl_tcp_low_latency); #ifdef CONFIG_TCP_MD5SIG @@ -100,6 +101,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) #endif struct inet_hashinfo tcp_hashinfo; +EXPORT_SYMBOL(tcp_hashinfo); static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) { @@ -139,7 +141,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) return 0; } - EXPORT_SYMBOL_GPL(tcp_twsk_unique); /* This will initiate an outgoing connection. */ @@ -204,10 +205,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * TIME-WAIT * and initialize rx_opt.ts_recent from it, * when trying new connection. */ - if (peer != NULL && - (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; + if (peer) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } } } @@ -237,7 +240,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; - sk_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, @@ -265,6 +268,7 @@ failure: inet->inet_dport = 0; return err; } +EXPORT_SYMBOL(tcp_v4_connect); /* * This routine does path mtu discovery as defined in RFC1191. @@ -543,6 +547,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } +EXPORT_SYMBOL(tcp_v4_send_check); int tcp_v4_gso_send_check(struct sk_buff *skb) { @@ -793,19 +798,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -#ifdef CONFIG_SYN_COOKIES -static void syn_flood_warning(struct sk_buff *skb) +static void syn_flood_warning(const struct sk_buff *skb) { - static unsigned long warntime; + const char *msg; - if (time_after(jiffies, (warntime + HZ * 60))) { - warntime = jiffies; - printk(KERN_INFO - "possible SYN flooding on port %d. Sending cookies.\n", - ntohs(tcp_hdr(skb)->dest)); - } -} +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) + msg = "Sending cookies"; + else #endif + msg = "Dropping request"; + + pr_info("TCP: Possible SYN flooding on port %d. %s.\n", + ntohs(tcp_hdr(skb)->dest), msg); +} /* * Save and compile IPv4 options into the request_sock if needed. @@ -857,7 +863,6 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, { return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); } - EXPORT_SYMBOL(tcp_v4_md5_lookup); static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, @@ -924,7 +929,6 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, } return 0; } - EXPORT_SYMBOL(tcp_v4_md5_do_add); static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, @@ -962,7 +966,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr) } return -ENOENT; } - EXPORT_SYMBOL(tcp_v4_md5_do_del); static void tcp_v4_clear_md5_list(struct sock *sk) @@ -1135,7 +1138,6 @@ clear_hash_noput: memset(md5_hash, 0, 16); return 1; } - EXPORT_SYMBOL(tcp_v4_md5_hash_skb); static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) @@ -1243,6 +1245,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * evidently real one. */ if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + if (net_ratelimit()) + syn_flood_warning(skb); #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { want_cookie = 1; @@ -1323,15 +1327,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; - if (!want_cookie) + if (!want_cookie || tmp_opt.tstamp_ok) TCP_ECN_create_request(req, tcp_hdr(skb)); if (want_cookie) { -#ifdef CONFIG_SYN_COOKIES - syn_flood_warning(skb); - req->cookie_ts = tmp_opt.tstamp_ok; -#endif isn = cookie_v4_init_sequence(sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { struct inet_peer *peer = NULL; @@ -1349,6 +1350,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { + inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { @@ -1393,6 +1395,7 @@ drop_and_free: drop: return 0; } +EXPORT_SYMBOL(tcp_v4_conn_request); /* @@ -1419,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) - goto exit; + goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(newsk, dst); @@ -1466,18 +1469,23 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto exit; + } __inet_hash_nolisten(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; exit_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - dst_release(dst); return NULL; } +EXPORT_SYMBOL(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { @@ -1504,7 +1512,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) } #ifdef CONFIG_SYN_COOKIES - if (!th->rst && !th->syn && th->ack) + if (!th->syn) sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); #endif return sk; @@ -1607,6 +1615,7 @@ csum_err: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } +EXPORT_SYMBOL(tcp_v4_do_rcv); /* * From tcp_input.c @@ -1793,6 +1802,7 @@ int tcp_v4_remember_stamp(struct sock *sk) return 0; } +EXPORT_SYMBOL(tcp_v4_remember_stamp); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { @@ -1832,6 +1842,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .compat_getsockopt = compat_ip_getsockopt, #endif }; +EXPORT_SYMBOL(ipv4_specific); #ifdef CONFIG_TCP_MD5SIG static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { @@ -1960,7 +1971,6 @@ void tcp_v4_destroy_sock(struct sock *sk) percpu_counter_dec(&tcp_sockets_allocated); } - EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS @@ -1978,6 +1988,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; } +/* + * Get next listener socket follow cur. If cur is NULL, get first socket + * starting from bucket given in st->bucket; when st->bucket is zero the + * very first socket in the hash table is returned. + */ static void *listening_get_next(struct seq_file *seq, void *cur) { struct inet_connection_sock *icsk; @@ -1988,14 +2003,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct net *net = seq_file_net(seq); if (!sk) { - st->bucket = 0; - ilb = &tcp_hashinfo.listening_hash[0]; + ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); sk = sk_nulls_head(&ilb->head); + st->offset = 0; goto get_sk; } ilb = &tcp_hashinfo.listening_hash[st->bucket]; ++st->num; + ++st->offset; if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; @@ -2010,6 +2026,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) } req = req->dl_next; } + st->offset = 0; if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) break; get_req: @@ -2045,6 +2062,7 @@ start_req: read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); + st->offset = 0; if (++st->bucket < INET_LHTABLE_SIZE) { ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); @@ -2058,7 +2076,12 @@ out: static void *listening_get_idx(struct seq_file *seq, loff_t *pos) { - void *rc = listening_get_next(seq, NULL); + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + st->offset = 0; + rc = listening_get_next(seq, NULL); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2073,13 +2096,18 @@ static inline int empty_bucket(struct tcp_iter_state *st) hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); } +/* + * Get first established socket starting from bucket given in st->bucket. + * If st->bucket is zero, the very first socket in the hash is returned. + */ static void *established_get_first(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; - for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { + st->offset = 0; + for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; struct inet_timewait_sock *tw; @@ -2124,6 +2152,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) struct net *net = seq_file_net(seq); ++st->num; + ++st->offset; if (st->state == TCP_SEQ_STATE_TIME_WAIT) { tw = cur; @@ -2140,6 +2169,7 @@ get_tw: st->state = TCP_SEQ_STATE_ESTABLISHED; /* Look for next non empty bucket */ + st->offset = 0; while (++st->bucket <= tcp_hashinfo.ehash_mask && empty_bucket(st)) ; @@ -2167,7 +2197,11 @@ out: static void *established_get_idx(struct seq_file *seq, loff_t pos) { - void *rc = established_get_first(seq); + struct tcp_iter_state *st = seq->private; + void *rc; + + st->bucket = 0; + rc = established_get_first(seq); while (rc && pos) { rc = established_get_next(seq, rc); @@ -2192,24 +2226,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) return rc; } +static void *tcp_seek_last_pos(struct seq_file *seq) +{ + struct tcp_iter_state *st = seq->private; + int offset = st->offset; + int orig_num = st->num; + void *rc = NULL; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + case TCP_SEQ_STATE_LISTENING: + if (st->bucket >= INET_LHTABLE_SIZE) + break; + st->state = TCP_SEQ_STATE_LISTENING; + rc = listening_get_next(seq, NULL); + while (offset-- && rc) + rc = listening_get_next(seq, rc); + if (rc) + break; + st->bucket = 0; + /* Fallthrough */ + case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + st->state = TCP_SEQ_STATE_ESTABLISHED; + if (st->bucket > tcp_hashinfo.ehash_mask) + break; + rc = established_get_first(seq); + while (offset-- && rc) + rc = established_get_next(seq, rc); + } + + st->num = orig_num; + + return rc; +} + static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) { struct tcp_iter_state *st = seq->private; + void *rc; + + if (*pos && *pos == st->last_pos) { + rc = tcp_seek_last_pos(seq); + if (rc) + goto out; + } + st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; - return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + st->bucket = 0; + st->offset = 0; + rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + +out: + st->last_pos = *pos; + return rc; } static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct tcp_iter_state *st = seq->private; void *rc = NULL; - struct tcp_iter_state *st; if (v == SEQ_START_TOKEN) { rc = tcp_get_idx(seq, 0); goto out; } - st = seq->private; switch (st->state) { case TCP_SEQ_STATE_OPENREQ: @@ -2217,6 +2299,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) rc = listening_get_next(seq, v); if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + st->offset = 0; rc = established_get_first(seq); } break; @@ -2227,6 +2311,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } out: ++*pos; + st->last_pos = *pos; return rc; } @@ -2265,6 +2350,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file) s = ((struct seq_file *)file->private_data)->private; s->family = afinfo->family; + s->last_pos = 0; return 0; } @@ -2288,11 +2374,13 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) rc = -ENOMEM; return rc; } +EXPORT_SYMBOL(tcp_proc_register); void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) { proc_net_remove(net, afinfo->name); } +EXPORT_SYMBOL(tcp_proc_unregister); static void get_openreq4(struct sock *sk, struct request_sock *req, struct seq_file *f, int i, int uid, int *len) @@ -2487,7 +2575,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) return tcp_gro_receive(head, skb); } -EXPORT_SYMBOL(tcp4_gro_receive); int tcp4_gro_complete(struct sk_buff *skb) { @@ -2500,7 +2587,6 @@ int tcp4_gro_complete(struct sk_buff *skb) return tcp_gro_complete(skb); } -EXPORT_SYMBOL(tcp4_gro_complete); struct proto tcp_prot = { .name = "TCP", @@ -2516,6 +2602,8 @@ struct proto tcp_prot = { .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, + .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .hash = inet_hash, .unhash = inet_unhash, @@ -2534,11 +2622,13 @@ struct proto tcp_prot = { .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif }; +EXPORT_SYMBOL(tcp_prot); static int __net_init tcp_sk_init(struct net *net) @@ -2569,20 +2659,3 @@ void __init tcp_v4_init(void) if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); } - -EXPORT_SYMBOL(ipv4_specific); -EXPORT_SYMBOL(tcp_hashinfo); -EXPORT_SYMBOL(tcp_prot); -EXPORT_SYMBOL(tcp_v4_conn_request); -EXPORT_SYMBOL(tcp_v4_connect); -EXPORT_SYMBOL(tcp_v4_do_rcv); -EXPORT_SYMBOL(tcp_v4_remember_stamp); -EXPORT_SYMBOL(tcp_v4_send_check); -EXPORT_SYMBOL(tcp_v4_syn_recv_sock); - -#ifdef CONFIG_PROC_FS -EXPORT_SYMBOL(tcp_proc_register); -EXPORT_SYMBOL(tcp_proc_unregister); -#endif -EXPORT_SYMBOL(sysctl_tcp_low_latency); - diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 794c2e122a41..43cf901d7659 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -47,7 +47,6 @@ struct inet_timewait_death_row tcp_death_row = { .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, (unsigned long)&tcp_death_row), }; - EXPORT_SYMBOL_GPL(tcp_death_row); static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) @@ -56,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return 1; if (after(end_seq, s_win) && before(seq, e_win)) return 1; - return (seq == e_win && seq == end_seq); + return seq == e_win && seq == end_seq; } /* @@ -262,6 +261,7 @@ kill: inet_twsk_put(tw); return TCP_TW_SUCCESS; } +EXPORT_SYMBOL(tcp_timewait_state_process); /* * Move a socket to time-wait or dead fin-wait-2 state. @@ -362,7 +362,6 @@ void tcp_twsk_destructor(struct sock *sk) tcp_free_md5sig_pool(); #endif } - EXPORT_SYMBOL_GPL(tcp_twsk_destructor); static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, @@ -510,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, } return newsk; } +EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented @@ -706,6 +706,7 @@ embryonic_reset: inet_csk_reqsk_queue_drop(sk, req, prev); return NULL; } +EXPORT_SYMBOL(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, @@ -737,8 +738,4 @@ int tcp_child_process(struct sock *parent, struct sock *child, sock_put(child); return ret; } - -EXPORT_SYMBOL(tcp_check_req); EXPORT_SYMBOL(tcp_child_process); -EXPORT_SYMBOL(tcp_create_openreq_child); -EXPORT_SYMBOL(tcp_timewait_state_process); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7ed9dc1042d1..05b1ecf36763 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to value enough for senders, - * following RFC2414. Senders, not following this RFC, - * will be satisfied with 2. - */ + /* Set initial window to value enough for senders, following RFC5681. */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = 4; - if (mss > 1460 * 3) - init_cwnd = 2; - else if (mss > 1460) - init_cwnd = 3; + int init_cwnd = rfc3390_bytes_to_packets(mss); + /* when initializing use the value from init_rcv_wnd * rather than the default from above */ @@ -247,6 +241,7 @@ void tcp_select_initial_window(int __space, __u32 mss, /* Set the clamp no higher than max representable value */ (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); } +EXPORT_SYMBOL(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return @@ -294,9 +289,9 @@ static u16 tcp_select_window(struct sock *sk) /* Packet ECN state for a SYN-ACK */ static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) - TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; + TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE; } /* Packet ECN state for a SYN. */ @@ -306,7 +301,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = 0; if (sysctl_tcp_ecn == 1) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; } } @@ -361,7 +356,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) skb_shinfo(skb)->gso_type = 0; TCP_SKB_CB(skb)->seq = seq; - if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) + if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } @@ -820,7 +815,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) + if (unlikely(tcb->flags & TCPHDR_SYN)) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); else tcp_options_size = tcp_established_options(sk, skb, &opts, @@ -843,7 +838,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + if (unlikely(tcb->flags & TCPHDR_SYN)) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ @@ -866,7 +861,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } tcp_options_write((__be32 *)(th + 1), tp, &opts); - if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) + if (likely((tcb->flags & TCPHDR_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG @@ -880,7 +875,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, icsk->icsk_af_ops->send_check(sk, skb); - if (likely(tcb->flags & TCPCB_FLAG_ACK)) + if (likely(tcb->flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) @@ -1023,7 +1018,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; @@ -1189,6 +1184,7 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; } +EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. @@ -1232,6 +1228,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } +EXPORT_SYMBOL(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. @@ -1328,8 +1325,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, u32 in_flight, cwnd; /* Don't be strict about the congestion window for the final FIN. */ - if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && - tcp_skb_pcount(skb) == 1) + if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1) return 1; in_flight = tcp_packets_in_flight(tp); @@ -1374,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned mss_now, int nonagle) { - return (skb->len < mss_now && + return skb->len < mss_now && ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return non-zero if the Nagle test allows this packet to be @@ -1398,7 +1394,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, * Nagle can be ignored during F-RTO too (see RFC4138). */ if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)) return 1; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) @@ -1447,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); - return (skb && + return skb && tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH))); + tp->nonagle : TCP_NAGLE_PUSH)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet @@ -1461,7 +1457,7 @@ int tcp_may_send_now(struct sock *sk) * packet has never been sent out before (and thus is not cloned). */ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, - unsigned int mss_now) + unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; int nlen = skb->len - len; @@ -1471,7 +1467,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now); - buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, 0, gfp); if (unlikely(buff == NULL)) return -ENOMEM; @@ -1487,7 +1483,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->flags = flags; /* This packet was never sent out yet, so no SACK bits. */ @@ -1518,7 +1514,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) goto send_now; if (icsk->icsk_ca_state != TCP_CA_Open) @@ -1644,7 +1640,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; - TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(nskb)->flags = TCPHDR_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; nskb->ip_summed = skb->ip_summed; @@ -1669,7 +1665,7 @@ static int tcp_mtu_probe(struct sock *sk) sk_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & - ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + ~(TCPHDR_FIN|TCPHDR_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -1769,7 +1765,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now))) + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; TCP_SKB_CB(skb)->when = tcp_time_stamp; @@ -2020,7 +2016,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!sysctl_tcp_retrans_collapse) return; - if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) + if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN) return; tcp_for_write_queue_from_safe(skb, tmp, sk) { @@ -2112,7 +2108,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * since it is cheap to do so and saves bytes on the network. */ if (skb->len > 0 && - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { /* Reuse, even though it does some unnecessary work */ @@ -2304,7 +2300,7 @@ void tcp_send_fin(struct sock *sk) mss_now = tcp_current_mss(sk); if (tcp_send_head(sk) != NULL) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + TCP_SKB_CB(skb)->flags |= TCPHDR_FIN; TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; } else { @@ -2321,7 +2317,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, - TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); @@ -2346,7 +2342,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), - TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCPHDR_ACK | TCPHDR_RST); /* Send it off. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) @@ -2366,11 +2362,11 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { + if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; } - if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { + if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) @@ -2384,7 +2380,7 @@ int tcp_send_synack(struct sock *sk) skb = nskb; } - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->flags |= TCPHDR_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; @@ -2427,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -2463,7 +2465,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, * not even correctly set) */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, - TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); + TCPHDR_SYN | TCPHDR_ACK); if (OPTION_COOKIE_EXTENSION & opts.options) { if (s_data_desired) { @@ -2518,6 +2520,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, return skb; } +EXPORT_SYMBOL(tcp_make_synack); /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) @@ -2552,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk) tcp_initialize_rcv_mss(sk); + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) + tp->window_clamp = tcp_full_space(sk); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, @@ -2595,7 +2603,7 @@ int tcp_connect(struct sock *sk) skb_reserve(buff, MAX_TCP_HEADER); tp->snd_nxt = tp->write_seq; - tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); + tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); TCP_ECN_send_syn(sk, buff); /* Send it off. */ @@ -2620,6 +2628,7 @@ int tcp_connect(struct sock *sk) inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } +EXPORT_SYMBOL(tcp_connect); /* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() @@ -2701,7 +2710,7 @@ void tcp_send_ack(struct sock *sk) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; @@ -2735,7 +2744,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -2765,13 +2774,13 @@ int tcp_write_wakeup(struct sock *sk) if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; if (tcp_fragment(sk, skb, seg_size, mss)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->flags |= TCPHDR_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) @@ -2824,10 +2833,3 @@ void tcp_send_probe0(struct sock *sk) TCP_RTO_MAX); } } - -EXPORT_SYMBOL(tcp_select_initial_window); -EXPORT_SYMBOL(tcp_connect); -EXPORT_SYMBOL(tcp_make_synack); -EXPORT_SYMBOL(tcp_simple_retransmit); -EXPORT_SYMBOL(tcp_sync_mss); -EXPORT_SYMBOL(tcp_mtup_init); diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f8efada580e8..6211e2114173 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = { .owner = THIS_MODULE, .open = tcpprobe_open, .read = tcpprobe_read, + .llseek = noop_llseek, }; static __init int tcpprobe_init(void) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 440a5c6004f6..74a6aa003657 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -41,7 +41,6 @@ void tcp_init_xmit_timers(struct sock *sk) inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); } - EXPORT_SYMBOL(tcp_init_xmit_timers); static void tcp_write_err(struct sock *sk) @@ -67,18 +66,18 @@ static void tcp_write_err(struct sock *sk) static int tcp_out_of_resources(struct sock *sk, int do_reset) { struct tcp_sock *tp = tcp_sk(sk); - int orphans = percpu_counter_read_positive(&tcp_orphan_count); + int shift = 0; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) - orphans <<= 1; + shift++; /* If some dubious ICMP arrived, penalize even more. */ if (sk->sk_err_soft) - orphans <<= 1; + shift++; - if (tcp_too_many_orphans(sk, orphans)) { + if (tcp_too_many_orphans(sk, shift)) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); @@ -136,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) /* This function calculates a "timeout" which is equivalent to the timeout of a * TCP connection after "boundary" unsuccessful, exponentially backed-off - * retransmissions with an initial RTO of TCP_RTO_MIN. + * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if + * syn_set flag is set. */ static bool retransmits_timed_out(struct sock *sk, - unsigned int boundary) + unsigned int boundary, + unsigned int timeout, + bool syn_set) { - unsigned int timeout, linear_backoff_thresh; - unsigned int start_ts; + unsigned int linear_backoff_thresh, start_ts; + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -152,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk, else start_ts = tcp_sk(sk)->retrans_stamp; - linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); - - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; - else - timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + if (likely(timeout == 0)) { + linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } return (tcp_time_stamp - start_ts) >= timeout; } @@ -168,14 +171,15 @@ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; - bool do_reset; + bool do_reset, syn_set = 0; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + syn_set = 1; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -188,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until); + !retransmits_timed_out(sk, retry_until, 0, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (retransmits_timed_out(sk, retry_until)) { + if (retransmits_timed_out(sk, retry_until, + syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -362,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk) if (icsk->icsk_retransmits == 0) { int mib_idx; - if (icsk->icsk_ca_state == TCP_CA_Disorder) { - if (tcp_is_sack(tp)) - mib_idx = LINUX_MIB_TCPSACKFAILURES; - else - mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else if (icsk->icsk_ca_state == TCP_CA_Recovery) { + if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; } else if (icsk->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSFAILURES; + } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) || + tp->sacked_out) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKFAILURES; + else + mib_idx = LINUX_MIB_TCPRENOFAILURES; } else { mib_idx = LINUX_MIB_TCPTIMEOUTS; } @@ -437,7 +443,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk); out:; @@ -557,7 +563,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { - if (icsk->icsk_probes_out >= keepalive_probes(tp)) { + /* If the TCP_USER_TIMEOUT option is enabled, use that + * to determine when to timeout instead. + */ + if ((icsk->icsk_user_timeout != 0 && + elapsed >= icsk->icsk_user_timeout && + icsk->icsk_probes_out > 0) || + (icsk->icsk_user_timeout == 0 && + icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index b612acf76183..38bc0b52d745 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -6,7 +6,7 @@ * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." * IEEE Journal on Selected Areas in Communication, * Feb. 2003. - * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf */ #include <linux/mm.h> diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 20151d6a6241..a534dda5456e 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk) */ static inline u32 westwood_do_filter(u32 a, u32 b) { - return (((7 * a) + b) >> 3); + return ((7 * a) + b) >> 3; } static void westwood_filter(struct westwood *w, u32 delta) diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 3b3813cc80b9..ac3b3ee4b07c 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,32 +14,37 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm_tunnel *tunnel4_handlers; -static struct xfrm_tunnel *tunnel64_handlers; +static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); -static inline struct xfrm_tunnel **fam_handlers(unsigned short family) +static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; @@ -48,18 +53,21 @@ err: return ret; } - EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; @@ -72,9 +80,13 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) return ret; } - EXPORT_SYMBOL(xfrm4_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; @@ -82,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; @@ -101,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; @@ -117,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) break; } @@ -127,7 +139,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) break; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eec4ff456e33..28cb2d733a3c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EOPNOTSUPP; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (up->pending) { /* @@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - err = sock_tx_timestamp(msg, sk, &ipc.shtx); + err = sock_tx_timestamp(sk, &ipc.tx_flags); if (err) return err; if (msg->msg_controllen) { @@ -914,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) - sk_dst_set(sk, dst_clone(&rt->u.dst)); + sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) @@ -978,7 +978,7 @@ out: return err; do_confirm: - dst_confirm(&rt->u.dst); + dst_confirm(&rt->dst); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; @@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk) } EXPORT_SYMBOL(udp_lib_unhash); +/* + * inet_rcv_saddr was changed, we must rehash secondary hash + */ +void udp_lib_rehash(struct sock *sk, u16 newhash) +{ + if (sk_hashed(sk)) { + struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_hslot *hslot, *hslot2, *nhslot2; + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + nhslot2 = udp_hashslot2(udptable, newhash); + udp_sk(sk)->udp_portaddr_hash = newhash; + if (hslot2 != nhslot2) { + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + /* we must lock primary chain too */ + spin_lock_bh(&hslot->lock); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &nhslot2->head); + nhslot2->count++; + spin_unlock(&nhslot2->lock); + + spin_unlock_bh(&hslot->lock); + } + } +} +EXPORT_SYMBOL(udp_lib_rehash); + +static void udp_v4_rehash(struct sock *sk) +{ + u16 new_hash = udp4_portaddr_hash(sock_net(sk), + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_num); + udp_lib_rehash(sk, new_hash); +} + static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1370,7 +1413,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_dereference_raw(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) goto drop; } @@ -1843,6 +1886,7 @@ struct proto udp_prot = { .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 6610bf76369f..ab76aa928fa9 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -58,6 +58,7 @@ struct proto udplite_prot = { .compat_getsockopt = compat_udp_getsockopt, #endif }; +EXPORT_SYMBOL(udplite_prot); static struct inet_protosw udplite4_protosw = { .type = SOCK_DGRAM, @@ -127,5 +128,3 @@ out_unregister_proto: out_register_err: printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); } - -EXPORT_SYMBOL(udplite_prot); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index ad8fbb871aa0..06814b6216dc 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -163,5 +163,4 @@ int xfrm4_rcv(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); } - EXPORT_SYMBOL(xfrm4_rcv); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 23883a48ebfb..4464f3bff6a7 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, fl.fl4_src = saddr->a4; err = __ip_route_output_key(net, &rt, &fl); - dst = &rt->u.dst; + dst = &rt->dst; if (err) dst = ERR_PTR(err); return dst; @@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net, static int xfrm4_get_tos(struct flowi *fl) { - return fl->fl4_tos; + return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ } static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, @@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); xfrm4_policy_afinfo.garbage_collect(net); - return (atomic_read(&ops->entries) > ops->gc_thresh * 2); + return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); } static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = { .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, .gc_thresh = 1024, - .entries = ATOMIC_INIT(0), }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { @@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size) * and start cleaning when were 1/2 full */ xfrm4_dst_ops.gc_thresh = rt_max_size/2; + dst_entries_init(&xfrm4_dst_ops); xfrm4_state_init(); xfrm4_policy_init(); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 1ef1366a0a03..47947624eccc 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x) } static void -__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, - struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) +{ + sel->daddr.a4 = fl->fl4_dst; + sel->saddr.a4 = fl->fl4_src; + sel->dport = xfrm_flowi_dport(fl); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET; + sel->prefixlen_d = 32; + sel->prefixlen_s = 32; + sel->proto = fl->proto; + sel->ifindex = fl->oif; +} + +static void +xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) { - x->sel.daddr.a4 = fl->fl4_dst; - x->sel.saddr.a4 = fl->fl4_src; - x->sel.dport = xfrm_flowi_dport(fl); - x->sel.dport_mask = htons(0xffff); - x->sel.sport = xfrm_flowi_sport(fl); - x->sel.sport_mask = htons(0xffff); - x->sel.family = AF_INET; - x->sel.prefixlen_d = 32; - x->sel.prefixlen_s = 32; - x->sel.proto = fl->proto; - x->sel.ifindex = fl->oif; x->id = tmpl->id; if (x->id.daddr.a4 == 0) x->id.daddr.a4 = daddr->a4; @@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .owner = THIS_MODULE, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, + .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 41f5982d2087..82806455e859 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) return -ENOENT; } -static struct xfrm_tunnel xfrm_tunnel_handler = { +static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct xfrm_tunnel xfrm64_tunnel_handler = { +static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, |