diff options
Diffstat (limited to 'net/ipv4')
120 files changed, 5822 insertions, 4965 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 9f9fd2c6f6e2..24e2b7294bf8 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -85,6 +85,13 @@ endchoice config IP_FIB_HASH def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER +config IP_FIB_TRIE_STATS + bool "FIB TRIE statistics" + depends on IP_FIB_TRIE + ---help--- + Keep track of statistics on structure of FIB TRIE table. + Useful for testing and measuring TRIE performance. + config IP_MULTIPLE_TABLES bool "IP: policy routing" depends on IP_ADVANCED_ROUTER diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 93fe3966805d..ad40ef3f9ebc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,9 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o \ + fib_frontend.o fib_semantics.o \ inet_fragment.o +obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d2f22e74b267..09ca5293d08f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -126,6 +126,10 @@ extern void ip_mc_drop_socket(struct sock *sk); static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); +struct ipv4_config ipv4_config; + +EXPORT_SYMBOL(ipv4_config); + /* New destruction routine */ void inet_sock_destruct(struct sock *sk) @@ -135,6 +139,8 @@ void inet_sock_destruct(struct sock *sk) __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_error_queue); + sk_mem_reclaim(sk); + if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { printk("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); @@ -440,7 +446,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(&init_net, addr->sin_addr.s_addr); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since @@ -789,12 +795,12 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCADDRT: case SIOCDELRT: case SIOCRTMSG: - err = ip_rt_ioctl(cmd, (void __user *)arg); + err = ip_rt_ioctl(sk->sk_net, cmd, (void __user *)arg); break; case SIOCDARP: case SIOCGARP: case SIOCSARP: - err = arp_ioctl(cmd, (void __user *)arg); + err = arp_ioctl(sk->sk_net, cmd, (void __user *)arg); break; case SIOCGIFADDR: case SIOCSIFADDR: @@ -838,6 +844,7 @@ const struct proto_ops inet_stream_ops = { .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = tcp_sendpage, + .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, @@ -1106,7 +1113,7 @@ int inet_sk_rebuild_header(struct sock *sk) }; security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&rt, &fl, sk, 0); + err = ip_route_output_flow(&init_net, &rt, &fl, sk, 0); } if (!err) sk_setup_caps(sk, &rt->u.dst); @@ -1237,7 +1244,7 @@ unsigned long snmp_fold_field(void *mib[], int offt) } EXPORT_SYMBOL_GPL(snmp_fold_field); -int snmp_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) +int snmp_mib_init(void *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); ptr[0] = __alloc_percpu(mibsize); @@ -1286,37 +1293,31 @@ static struct net_protocol udp_protocol = { static struct net_protocol icmp_protocol = { .handler = icmp_rcv, + .no_policy = 1, }; static int __init init_ipv4_mibs(void) { if (snmp_mib_init((void **)net_statistics, - sizeof(struct linux_mib), - __alignof__(struct linux_mib)) < 0) + sizeof(struct linux_mib)) < 0) goto err_net_mib; if (snmp_mib_init((void **)ip_statistics, - sizeof(struct ipstats_mib), - __alignof__(struct ipstats_mib)) < 0) + sizeof(struct ipstats_mib)) < 0) goto err_ip_mib; if (snmp_mib_init((void **)icmp_statistics, - sizeof(struct icmp_mib), - __alignof__(struct icmp_mib)) < 0) + sizeof(struct icmp_mib)) < 0) goto err_icmp_mib; if (snmp_mib_init((void **)icmpmsg_statistics, - sizeof(struct icmpmsg_mib), - __alignof__(struct icmpmsg_mib)) < 0) + sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; if (snmp_mib_init((void **)tcp_statistics, - sizeof(struct tcp_mib), - __alignof__(struct tcp_mib)) < 0) + sizeof(struct tcp_mib)) < 0) goto err_tcp_mib; if (snmp_mib_init((void **)udp_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + sizeof(struct udp_mib)) < 0) goto err_udp_mib; if (snmp_mib_init((void **)udplite_statistics, - sizeof(struct udp_mib), - __alignof__(struct udp_mib)) < 0) + sizeof(struct udp_mib)) < 0) goto err_udplite_mib; tcp_mib_init(); @@ -1418,6 +1419,9 @@ static int __init inet_init(void) /* Setup TCP slab cache for open requests. */ tcp_init(); + /* Setup UDP memory threshold */ + udp_init(); + /* Add UDP-Lite (RFC 3828) */ udplite4_register(); @@ -1471,15 +1475,11 @@ static int __init ipv4_proc_init(void) goto out_tcp; if (udp4_proc_init()) goto out_udp; - if (fib_proc_init()) - goto out_fib; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: - fib_proc_exit(); -out_fib: udp4_proc_exit(); out_udp: tcp4_proc_exit(); diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 5fc346d8b566..d76803a3dcae 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -169,6 +169,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) if (ip_clear_mutable_options(iph, &dummy)) goto out; } + + spin_lock(&x->lock); { u8 auth_data[MAX_AH_AUTH_LEN]; @@ -176,13 +178,16 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); err = ah_mac_digest(ahp, skb, ah->auth_data); if (err) - goto out; - err = -EINVAL; - if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) { - x->stats.integrity_failed++; - goto out; - } + goto unlock; + if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) + err = -EBADMSG; } +unlock: + spin_unlock(&x->lock); + + if (err) + goto out; + skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_buf, ihl); skb->transport_header = skb->network_header; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 54a76b8b803a..5976c598cc4b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -235,8 +235,6 @@ static int arp_constructor(struct neighbour *neigh) struct in_device *in_dev; struct neigh_parms *parms; - neigh->type = inet_addr_type(addr); - rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev == NULL) { @@ -244,6 +242,8 @@ static int arp_constructor(struct neighbour *neigh) return -EINVAL; } + neigh->type = inet_addr_type(&init_net, addr); + parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); @@ -341,14 +341,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(ip_hdr(skb)->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(&init_net, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; - if (inet_addr_type(saddr) == RTN_LOCAL) { + if (inet_addr_type(&init_net, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -382,8 +382,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) read_unlock_bh(&neigh->lock); } -static int arp_ignore(struct in_device *in_dev, struct net_device *dev, - __be32 sip, __be32 tip) +static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { int scope; @@ -403,7 +402,6 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev, case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; - dev = NULL; break; case 4: /* Reserved */ case 5: @@ -415,7 +413,7 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev, default: return 0; } - return !inet_confirm_addr(dev, sip, tip, scope); + return !inet_confirm_addr(in_dev, sip, tip, scope); } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) @@ -426,7 +424,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) int flag = 0; /*unsigned long now; */ - if (ip_route_output_key(&rt, &fl) < 0) + if (ip_route_output_key(&init_net, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); @@ -479,7 +477,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = ((struct rtable*)skb->dst)->rt_gateway; - if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(&init_net, paddr), haddr, paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); @@ -777,7 +775,7 @@ static int arp_process(struct sk_buff *skb) * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ - if (LOOPBACK(tip) || MULTICAST(tip)) + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) goto out; /* @@ -806,8 +804,8 @@ static int arp_process(struct sk_buff *skb) /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && - inet_addr_type(tip) == RTN_LOCAL && - !arp_ignore(in_dev,dev,sip,tip)) + inet_addr_type(&init_net, tip) == RTN_LOCAL && + !arp_ignore(in_dev, sip, tip)) arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha); goto out; @@ -825,7 +823,7 @@ static int arp_process(struct sk_buff *skb) int dont_send = 0; if (!dont_send) - dont_send |= arp_ignore(in_dev,dev,sip,tip); + dont_send |= arp_ignore(in_dev,sip,tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send |= arp_filter(sip,tip,dev); if (!dont_send) @@ -835,9 +833,8 @@ static int arp_process(struct sk_buff *skb) } goto out; } else if (IN_DEV_FORWARD(in_dev)) { - if ((rt->rt_flags&RTCF_DNAT) || - (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { + if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &init_net, &tip, dev, 0))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -860,14 +857,14 @@ static int arp_process(struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (IPV4_DEVCONF_ALL(ARP_ACCEPT)) { + if (IPV4_DEVCONF_ALL(dev->nd_net, ARP_ACCEPT)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (n == NULL && arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(sip) == RTN_UNICAST) + inet_addr_type(&init_net, sip) == RTN_UNICAST) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -952,44 +949,60 @@ out_of_mem: * Set (create) an ARP cache entry. */ -static int arp_req_set(struct arpreq *r, struct net_device * dev) +static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { - __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + if (dev == NULL) { + IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; + return 0; + } + if (__in_dev_get_rtnl(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); + return 0; + } + return -ENXIO; +} + +static int arp_req_set_public(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + + if (mask && mask != htonl(0xFFFFFFFF)) + return -EINVAL; + if (!dev && (r->arp_flags & ATF_COM)) { + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, + r->arp_ha.sa_data); + if (!dev) + return -ENODEV; + } + if (mask) { + if (pneigh_lookup(&arp_tbl, net, &ip, dev, 1) == NULL) + return -ENOBUFS; + return 0; + } + + return arp_req_set_proxy(net, dev, 1); +} + +static int arp_req_set(struct net *net, struct arpreq *r, + struct net_device * dev) +{ + __be32 ip; struct neighbour *neigh; int err; - if (r->arp_flags&ATF_PUBL) { - __be32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; - if (mask && mask != htonl(0xFFFFFFFF)) - return -EINVAL; - if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data); - if (!dev) - return -ENODEV; - } - if (mask) { - if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL) - return -ENOBUFS; - return 0; - } - if (dev == NULL) { - IPV4_DEVCONF_ALL(PROXY_ARP) = 1; - return 0; - } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, 1); - return 0; - } - return -ENXIO; - } + if (r->arp_flags & ATF_PUBL) + return arp_req_set_public(net, r, dev); + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output_key(&rt, &fl)) != 0) + if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1066,37 +1079,37 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -static int arp_req_delete(struct arpreq *r, struct net_device * dev) +static int arp_req_delete_public(struct net *net, struct arpreq *r, + struct net_device *dev) +{ + __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + + if (mask == htonl(0xFFFFFFFF)) + return pneigh_delete(&arp_tbl, net, &ip, dev); + + if (mask) + return -EINVAL; + + return arp_req_set_proxy(net, dev, 0); +} + +static int arp_req_delete(struct net *net, struct arpreq *r, + struct net_device * dev) { int err; - __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + __be32 ip; struct neighbour *neigh; - if (r->arp_flags & ATF_PUBL) { - __be32 mask = - ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; - if (mask == htonl(0xFFFFFFFF)) - return pneigh_delete(&arp_tbl, &ip, dev); - if (mask == 0) { - if (dev == NULL) { - IPV4_DEVCONF_ALL(PROXY_ARP) = 0; - return 0; - } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), - PROXY_ARP, 0); - return 0; - } - return -ENXIO; - } - return -EINVAL; - } + if (r->arp_flags & ATF_PUBL) + return arp_req_delete_public(net, r, dev); + ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output_key(&rt, &fl)) != 0) + if ((err = ip_route_output_key(net, &rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1119,7 +1132,7 @@ static int arp_req_delete(struct arpreq *r, struct net_device * dev) * Handle an ARP layer I/O control request. */ -int arp_ioctl(unsigned int cmd, void __user *arg) +int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { int err; struct arpreq r; @@ -1151,7 +1164,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL) + if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) goto out; /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ @@ -1167,10 +1180,10 @@ int arp_ioctl(unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: - err = arp_req_delete(&r, dev); + err = arp_req_delete(net, &r, dev); break; case SIOCSARP: - err = arp_req_set(&r, dev); + err = arp_req_set(net, &r, dev); break; case SIOCGARP: err = arp_req_get(&r, dev); @@ -1359,8 +1372,8 @@ static const struct seq_operations arp_seq_ops = { static int arp_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &arp_seq_ops, - sizeof(struct neigh_seq_state)); + return seq_open_net(inode, file, &arp_seq_ops, + sizeof(struct neigh_seq_state)); } static const struct file_operations arp_seq_fops = { @@ -1368,7 +1381,7 @@ static const struct file_operations arp_seq_fops = { .open = arp_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static int __init arp_proc_init(void) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index f18e88bc86ec..d4dc4eb48d95 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -63,7 +63,7 @@ struct cipso_v4_domhsh_entry { * probably be turned into a hash table or something similar so we * can do quick lookups. */ static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); -static struct list_head cipso_v4_doi_list = LIST_HEAD_INIT(cipso_v4_doi_list); +static LIST_HEAD(cipso_v4_doi_list); /* Label mapping cache */ int cipso_v4_cache_enabled = 1; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 0301dd468cf4..0c0c73f368ce 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -40,7 +40,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) oif = sk->sk_bound_dev_if; saddr = inet->saddr; - if (MULTICAST(usin->sin_addr.s_addr)) { + if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif) oif = inet->mc_index; if (!saddr) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index b42f74617bac..21f71bf912d5 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -62,6 +62,7 @@ #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> +#include <net/net_namespace.h> struct ipv4_devconf ipv4_devconf = { .data = { @@ -82,7 +83,8 @@ static struct ipv4_devconf ipv4_devconf_dflt = { }, }; -#define IPV4_DEVCONF_DFLT(attr) IPV4_DEVCONF(ipv4_devconf_dflt, attr) +#define IPV4_DEVCONF_DFLT(net, attr) \ + IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, @@ -98,9 +100,15 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); #ifdef CONFIG_SYSCTL -static void devinet_sysctl_register(struct in_device *in_dev, - struct ipv4_devconf *p); -static void devinet_sysctl_unregister(struct ipv4_devconf *p); +static void devinet_sysctl_register(struct in_device *idev); +static void devinet_sysctl_unregister(struct in_device *idev); +#else +static inline void devinet_sysctl_register(struct in_device *idev) +{ +} +static inline void devinet_sysctl_unregister(struct in_device *idev) +{ +} #endif /* Locks all the inet devices. */ @@ -157,24 +165,18 @@ static struct in_device *inetdev_init(struct net_device *dev) if (!in_dev) goto out; INIT_RCU_HEAD(&in_dev->rcu_head); - memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf)); + memcpy(&in_dev->cnf, dev->nd_net->ipv4.devconf_dflt, + sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) goto out_kfree; /* Reference in_dev->dev */ dev_hold(dev); -#ifdef CONFIG_SYSCTL - neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); -#endif - /* Account for reference dev->ip_ptr (below) */ in_dev_hold(in_dev); -#ifdef CONFIG_SYSCTL - devinet_sysctl_register(in_dev, &in_dev->cnf); -#endif + devinet_sysctl_register(in_dev); ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); @@ -213,15 +215,9 @@ static void inetdev_destroy(struct in_device *in_dev) inet_free_ifa(ifa); } -#ifdef CONFIG_SYSCTL - devinet_sysctl_unregister(&in_dev->cnf); -#endif - dev->ip_ptr = NULL; -#ifdef CONFIG_SYSCTL - neigh_sysctl_unregister(in_dev->arp_parms); -#endif + devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); @@ -408,17 +404,17 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) in_dev_hold(in_dev); ifa->ifa_dev = in_dev; } - if (LOOPBACK(ifa->ifa_local)) + if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } -struct in_device *inetdev_by_index(int ifindex) +struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; read_lock(&dev_base_lock); - dev = __dev_get_by_index(&init_net, ifindex); + dev = __dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); read_unlock(&dev_base_lock); @@ -441,6 +437,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; @@ -449,12 +446,15 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg ASSERT_RTNL(); + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); if (err < 0) goto errout; ifm = nlmsg_data(nlh); - in_dev = inetdev_by_index(ifm->ifa_index); + in_dev = inetdev_by_index(net, ifm->ifa_index); if (in_dev == NULL) { err = -ENODEV; goto errout; @@ -560,10 +560,14 @@ errout: static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct in_ifaddr *ifa; ASSERT_RTNL(); + if (net != &init_net) + return -EINVAL; + ifa = rtm_to_ifaddr(nlh); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -579,7 +583,7 @@ static __inline__ int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ - if (ZERONET(addr)) + if (ipv4_is_zeronet(addr)) rc = 0; else { __u32 haddr = ntohl(addr); @@ -964,28 +968,25 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, /* * Confirm that local IP address exists using wildcards: - * - dev: only on this interface, 0=any interface + * - in_dev: only on this interface, 0=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ -__be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, int scope) +__be32 inet_confirm_addr(struct in_device *in_dev, + __be32 dst, __be32 local, int scope) { __be32 addr = 0; - struct in_device *in_dev; - - if (dev) { - rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev))) - addr = confirm_addr_indev(in_dev, dst, local, scope); - rcu_read_unlock(); + struct net_device *dev; + struct net *net; - return addr; - } + if (scope != RT_SCOPE_LINK) + return confirm_addr_indev(in_dev, dst, local, scope); + net = in_dev->dev->nd_net; read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { if ((in_dev = __in_dev_get_rcu(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) @@ -1106,13 +1107,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, */ inetdev_changename(dev, in_dev); -#ifdef CONFIG_SYSCTL - devinet_sysctl_unregister(&in_dev->cnf); - neigh_sysctl_unregister(in_dev->arp_parms); - neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); - devinet_sysctl_register(in_dev, &in_dev->cnf); -#endif + devinet_sysctl_unregister(in_dev); + devinet_sysctl_register(in_dev); break; } out: @@ -1174,12 +1170,16 @@ nla_put_failure: static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; int idx, ip_idx; struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; int s_ip_idx, s_idx = cb->args[0]; + if (net != &init_net) + return 0; + s_ip_idx = ip_idx = cb->args[1]; idx = 0; for_each_netdev(&init_net, dev) { @@ -1228,28 +1228,50 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa, struct nlmsghdr *nlh, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); + err = rtnl_notify(skb, &init_net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV4_IFADDR, err); + rtnl_set_sk_err(&init_net, RTNLGRP_IPV4_IFADDR, err); } #ifdef CONFIG_SYSCTL -static void devinet_copy_dflt_conf(int i) +static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { + for_each_netdev(net, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) - in_dev->cnf.data[i] = ipv4_devconf_dflt.data[i]; + in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; + rcu_read_unlock(); + } + read_unlock(&dev_base_lock); +} + +static void inet_forward_change(struct net *net) +{ + struct net_device *dev; + int on = IPV4_DEVCONF_ALL(net, FORWARDING); + + IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; + IPV4_DEVCONF_DFLT(net, FORWARDING) = on; + + read_lock(&dev_base_lock); + for_each_netdev(net, dev) { + struct in_device *in_dev; + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + IN_DEV_CONF_SET(in_dev, FORWARDING, on); rcu_read_unlock(); } read_unlock(&dev_base_lock); + + rt_cache_flush(0); } static int devinet_conf_proc(ctl_table *ctl, int write, @@ -1260,12 +1282,13 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (write) { struct ipv4_devconf *cnf = ctl->extra1; + struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; set_bit(i, cnf->state); - if (cnf == &ipv4_devconf_dflt) - devinet_copy_dflt_conf(i); + if (cnf == net->ipv4.devconf_dflt) + devinet_copy_dflt_conf(net, i); } return ret; @@ -1276,6 +1299,7 @@ static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, void __user *newval, size_t newlen) { struct ipv4_devconf *cnf; + struct net *net; int *valp = table->data; int new; int i; @@ -1311,38 +1335,17 @@ static int devinet_conf_sysctl(ctl_table *table, int __user *name, int nlen, *valp = new; cnf = table->extra1; + net = table->extra2; i = (int *)table->data - cnf->data; set_bit(i, cnf->state); - if (cnf == &ipv4_devconf_dflt) - devinet_copy_dflt_conf(i); + if (cnf == net->ipv4.devconf_dflt) + devinet_copy_dflt_conf(net, i); return 1; } -void inet_forward_change(void) -{ - struct net_device *dev; - int on = IPV4_DEVCONF_ALL(FORWARDING); - - IPV4_DEVCONF_ALL(ACCEPT_REDIRECTS) = !on; - IPV4_DEVCONF_DFLT(FORWARDING) = on; - - read_lock(&dev_base_lock); - for_each_netdev(&init_net, dev) { - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - IN_DEV_CONF_SET(in_dev, FORWARDING, on); - rcu_read_unlock(); - } - read_unlock(&dev_base_lock); - - rt_cache_flush(0); -} - static int devinet_sysctl_forward(ctl_table *ctl, int write, struct file* filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1352,9 +1355,11 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write && *valp != val) { - if (valp == &IPV4_DEVCONF_ALL(FORWARDING)) - inet_forward_change(); - else if (valp != &IPV4_DEVCONF_DFLT(FORWARDING)) + struct net *net = ctl->extra2; + + if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) + inet_forward_change(net); + else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) rt_cache_flush(0); } @@ -1419,11 +1424,8 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; - ctl_table devinet_dev[2]; - ctl_table devinet_conf_dir[2]; - ctl_table devinet_proto_dir[2]; - ctl_table devinet_root_dir[2]; + struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; + char *dev_name; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", @@ -1455,62 +1457,32 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), }, - .devinet_dev = { - { - .ctl_name = NET_PROTO_CONF_ALL, - .procname = "all", - .mode = 0555, - .child = devinet_sysctl.devinet_vars, - }, - }, - .devinet_conf_dir = { - { - .ctl_name = NET_IPV4_CONF, - .procname = "conf", - .mode = 0555, - .child = devinet_sysctl.devinet_dev, - }, - }, - .devinet_proto_dir = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = devinet_sysctl.devinet_conf_dir, - }, - }, - .devinet_root_dir = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = devinet_sysctl.devinet_proto_dir, - }, - }, }; -static void devinet_sysctl_register(struct in_device *in_dev, - struct ipv4_devconf *p) +static int __devinet_sysctl_register(struct net *net, char *dev_name, + int ctl_name, struct ipv4_devconf *p) { int i; - struct net_device *dev = in_dev ? in_dev->dev : NULL; - struct devinet_sysctl_table *t = kmemdup(&devinet_sysctl, sizeof(*t), - GFP_KERNEL); - char *dev_name = NULL; + struct devinet_sysctl_table *t; +#define DEVINET_CTL_PATH_DEV 3 + + struct ctl_path devinet_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "conf", .ctl_name = NET_IPV4_CONF, }, + { /* to be set */ }, + { }, + }; + + t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); if (!t) - return; + goto out; + for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; - } - - if (dev) { - dev_name = dev->name; - t->devinet_dev[0].ctl_name = dev->ifindex; - } else { - dev_name = "default"; - t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + t->devinet_vars[i].extra2 = net; } /* @@ -1518,56 +1490,183 @@ static void devinet_sysctl_register(struct in_device *in_dev, * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). */ - dev_name = kstrdup(dev_name, GFP_KERNEL); - if (!dev_name) - goto free; + t->dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!t->dev_name) + goto free; - t->devinet_dev[0].procname = dev_name; - t->devinet_dev[0].child = t->devinet_vars; - t->devinet_conf_dir[0].child = t->devinet_dev; - t->devinet_proto_dir[0].child = t->devinet_conf_dir; - t->devinet_root_dir[0].child = t->devinet_proto_dir; + devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; + devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name; - t->sysctl_header = register_sysctl_table(t->devinet_root_dir); + t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, + t->devinet_vars); if (!t->sysctl_header) - goto free_procname; + goto free_procname; p->sysctl = t; - return; + return 0; - /* error path */ - free_procname: - kfree(dev_name); - free: +free_procname: + kfree(t->dev_name); +free: kfree(t); - return; +out: + return -ENOBUFS; } -static void devinet_sysctl_unregister(struct ipv4_devconf *p) +static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) +{ + struct devinet_sysctl_table *t = cnf->sysctl; + + if (t == NULL) + return; + + cnf->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); +} + +static void devinet_sysctl_register(struct in_device *idev) +{ + neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); + __devinet_sysctl_register(idev->dev->nd_net, idev->dev->name, + idev->dev->ifindex, &idev->cnf); +} + +static void devinet_sysctl_unregister(struct in_device *idev) +{ + __devinet_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->arp_parms); +} + +static struct ctl_table ctl_forward_entry[] = { + { + .ctl_name = NET_IPV4_FORWARD, + .procname = "ip_forward", + .data = &ipv4_devconf.data[ + NET_IPV4_CONF_FORWARDING - 1], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = devinet_sysctl_forward, + .strategy = devinet_conf_sysctl, + .extra1 = &ipv4_devconf, + .extra2 = &init_net, + }, + { }, +}; + +static __net_initdata struct ctl_path net_ipv4_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { }, +}; +#endif + +static __net_init int devinet_init_net(struct net *net) { - if (p->sysctl) { - struct devinet_sysctl_table *t = p->sysctl; - p->sysctl = NULL; - unregister_sysctl_table(t->sysctl_header); - kfree(t->devinet_dev[0].procname); - kfree(t); + int err; + struct ipv4_devconf *all, *dflt; +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl = ctl_forward_entry; + struct ctl_table_header *forw_hdr; +#endif + + err = -ENOMEM; + all = &ipv4_devconf; + dflt = &ipv4_devconf_dflt; + + if (net != &init_net) { + all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; + + dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; + +#ifdef CONFIG_SYSCTL + tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL); + if (tbl == NULL) + goto err_alloc_ctl; + + tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1]; + tbl[0].extra1 = all; + tbl[0].extra2 = net; +#endif } + +#ifdef CONFIG_SYSCTL + err = __devinet_sysctl_register(net, "all", + NET_PROTO_CONF_ALL, all); + if (err < 0) + goto err_reg_all; + + err = __devinet_sysctl_register(net, "default", + NET_PROTO_CONF_DEFAULT, dflt); + if (err < 0) + goto err_reg_dflt; + + err = -ENOMEM; + forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl); + if (forw_hdr == NULL) + goto err_reg_ctl; + net->ipv4.forw_hdr = forw_hdr; +#endif + + net->ipv4.devconf_all = all; + net->ipv4.devconf_dflt = dflt; + return 0; + +#ifdef CONFIG_SYSCTL +err_reg_ctl: + __devinet_sysctl_unregister(dflt); +err_reg_dflt: + __devinet_sysctl_unregister(all); +err_reg_all: + if (tbl != ctl_forward_entry) + kfree(tbl); +err_alloc_ctl: +#endif + if (dflt != &ipv4_devconf_dflt) + kfree(dflt); +err_alloc_dflt: + if (all != &ipv4_devconf) + kfree(all); +err_alloc_all: + return err; } + +static __net_exit void devinet_exit_net(struct net *net) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + tbl = net->ipv4.forw_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.forw_hdr); + __devinet_sysctl_unregister(net->ipv4.devconf_dflt); + __devinet_sysctl_unregister(net->ipv4.devconf_all); + kfree(tbl); #endif + kfree(net->ipv4.devconf_dflt); + kfree(net->ipv4.devconf_all); +} + +static __net_initdata struct pernet_operations devinet_ops = { + .init = devinet_init_net, + .exit = devinet_exit_net, +}; void __init devinet_init(void) { + register_pernet_subsys(&devinet_ops); + register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); -#ifdef CONFIG_SYSCTL - devinet_sysctl.sysctl_header = - register_sysctl_table(devinet_sysctl.devinet_root_dir); - devinet_sysctl_register(NULL, &ipv4_devconf_dflt); -#endif } EXPORT_SYMBOL(in_dev_finish_destroy); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1738113268bc..28ea5c77ca23 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -163,7 +163,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) u8 nexthdr[2]; struct scatterlist *sg; int padlen; - int err; + int err = -EINVAL; if (!pskb_may_pull(skb, sizeof(*esph))) goto out; @@ -171,28 +171,31 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (elen <= 0 || (elen & (blksize-1))) goto out; + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + skb->ip_summed = CHECKSUM_NONE; + + spin_lock(&x->lock); + /* If integrity check is required, do this. */ if (esp->auth.icv_full_len) { u8 sum[alen]; err = esp_mac_digest(esp, skb, 0, skb->len - alen); if (err) - goto out; + goto unlock; if (skb_copy_bits(skb, skb->len - alen, sum, alen)) BUG(); if (unlikely(memcmp(esp->auth.work_icv, sum, alen))) { - x->stats.integrity_failed++; - goto out; + err = -EBADMSG; + goto unlock; } } - if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) - goto out; - - skb->ip_summed = CHECKSUM_NONE; - esph = (struct ip_esp_hdr *)skb->data; /* Get ivec. This can be wrong, check against another impls. */ @@ -202,9 +205,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) sg = &esp->sgbuf[0]; if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + err = -ENOMEM; sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); if (!sg) - goto out; + goto unlock; } sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, @@ -213,12 +217,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) err = crypto_blkcipher_decrypt(&desc, sg, sg, elen); if (unlikely(sg != &esp->sgbuf[0])) kfree(sg); + +unlock: + spin_unlock(&x->lock); + if (unlikely(err)) - return err; + goto out; if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) BUG(); + err = -EINVAL; padlen = nexthdr[0]; if (padlen+2 >= elen) goto out; @@ -276,7 +285,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) return nexthdr[1]; out: - return -EINVAL; + return err; } static u32 esp4_get_mtu(struct xfrm_state *x, int mtu) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 97abf934d185..d28261826bc2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -47,59 +47,65 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> -#define FFprint(a...) printk(KERN_DEBUG a) +#ifndef CONFIG_IP_MULTIPLE_TABLES -static struct sock *fibnl; +static int __net_init fib4_rules_init(struct net *net) +{ + struct fib_table *local_table, *main_table; -#ifndef CONFIG_IP_MULTIPLE_TABLES + local_table = fib_hash_table(RT_TABLE_LOCAL); + if (local_table == NULL) + return -ENOMEM; -struct fib_table *ip_fib_local_table; -struct fib_table *ip_fib_main_table; + main_table = fib_hash_table(RT_TABLE_MAIN); + if (main_table == NULL) + goto fail; -#define FIB_TABLE_HASHSZ 1 -static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; + hlist_add_head_rcu(&local_table->tb_hlist, + &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); + hlist_add_head_rcu(&main_table->tb_hlist, + &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); + return 0; -static void __init fib4_rules_init(void) -{ - ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); - hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]); - ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); - hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]); +fail: + kfree(local_table); + return -ENOMEM; } #else -#define FIB_TABLE_HASHSZ 256 -static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; - -struct fib_table *fib_new_table(u32 id) +struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; - tb = fib_get_table(id); + tb = fib_get_table(net, id); if (tb) return tb; - tb = fib_hash_init(id); + + tb = fib_hash_table(id); if (!tb) return NULL; h = id & (FIB_TABLE_HASHSZ - 1); - hlist_add_head_rcu(&tb->tb_hlist, &fib_table_hash[h]); + hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } -struct fib_table *fib_get_table(u32 id) +struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_node *node; + struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); + rcu_read_lock(); - hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb_hlist) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { if (tb->tb_id == id) { rcu_read_unlock(); return tb; @@ -110,15 +116,32 @@ struct fib_table *fib_get_table(u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ -static void fib_flush(void) +void fib_select_default(struct net *net, + const struct flowi *flp, struct fib_result *res) +{ + struct fib_table *tb; + int table = RT_TABLE_MAIN; +#ifdef CONFIG_IP_MULTIPLE_TABLES + if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) + return; + table = res->r->table; +#endif + tb = fib_get_table(net, table); + if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + tb->tb_select_default(tb, flp, res); +} + +static void fib_flush(struct net *net) { int flushed = 0; struct fib_table *tb; struct hlist_node *node; + struct hlist_head *head; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, node, head, tb_hlist) flushed += tb->tb_flush(tb); } @@ -130,7 +153,7 @@ static void fib_flush(void) * Find the first device with a given source address. */ -struct net_device * ip_dev_find(__be32 addr) +struct net_device * ip_dev_find(struct net *net, __be32 addr) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; struct fib_result res; @@ -141,7 +164,7 @@ struct net_device * ip_dev_find(__be32 addr) res.r = NULL; #endif - local_table = fib_get_table(RT_TABLE_LOCAL); + local_table = fib_get_table(net, RT_TABLE_LOCAL); if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) @@ -155,33 +178,51 @@ out: return dev; } -unsigned inet_addr_type(__be32 addr) +/* + * Find address type as if only "dev" was present in the system. If + * on_dev is NULL then all interfaces are taken into consideration. + */ +static inline unsigned __inet_dev_addr_type(struct net *net, + const struct net_device *dev, + __be32 addr) { struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; struct fib_result res; unsigned ret = RTN_BROADCAST; struct fib_table *local_table; - if (ZERONET(addr) || BADCLASS(addr)) + if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; - if (MULTICAST(addr)) + if (ipv4_is_multicast(addr)) return RTN_MULTICAST; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - local_table = fib_get_table(RT_TABLE_LOCAL); + local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; if (!local_table->tb_lookup(local_table, &fl, &res)) { - ret = res.type; + if (!dev || dev == res.fi->fib_dev) + ret = res.type; fib_res_put(&res); } } return ret; } +unsigned int inet_addr_type(struct net *net, __be32 addr) +{ + return __inet_dev_addr_type(net, NULL, addr); +} + +unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, + __be32 addr) +{ + return __inet_dev_addr_type(net, dev, addr); +} + /* Given (packet source, input interface) and optional (dst, oif, tos): - (main) check, that source is valid i.e. not broadcast or our local address. @@ -202,6 +243,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct fib_result res; int no_addr, rpf; int ret; + struct net *net; no_addr = rpf = 0; rcu_read_lock(); @@ -215,7 +257,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (in_dev == NULL) goto e_inval; - if (fib_lookup(&fl, &res)) + net = dev->nd_net; + if (fib_lookup(net, &fl, &res)) goto last_resort; if (res.type != RTN_UNICAST) goto e_inval_res; @@ -239,7 +282,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, fl.oif = dev->ifindex; ret = 0; - if (fib_lookup(&fl, &res) == 0) { + if (fib_lookup(net, &fl, &res) == 0) { if (res.type == RTN_UNICAST) { *spec_dst = FIB_RES_PREFSRC(res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; @@ -278,13 +321,14 @@ static int put_rtax(struct nlattr *mx, int len, int type, u32 value) return len + nla_total_size(4); } -static int rtentry_to_fib_config(int cmd, struct rtentry *rt, +static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); + cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; @@ -345,7 +389,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, colon = strchr(devname, ':'); if (colon) *colon = 0; - dev = __dev_get_by_name(&init_net, devname); + dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; @@ -368,7 +412,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, if (rt->rt_gateway.sa_family == AF_INET && addr) { cfg->fc_gw = addr; if (rt->rt_flags & RTF_GATEWAY && - inet_addr_type(addr) == RTN_UNICAST) + inet_addr_type(net, addr) == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } @@ -409,7 +453,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, * Handle IP routing ioctl calls. These are used to manipulate the routing tables */ -int ip_rt_ioctl(unsigned int cmd, void __user *arg) +int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct fib_config cfg; struct rtentry rt; @@ -425,18 +469,18 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg) return -EFAULT; rtnl_lock(); - err = rtentry_to_fib_config(cmd, &rt, &cfg); + err = rtentry_to_fib_config(net, cmd, &rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { - tb = fib_get_table(cfg.fc_table); + tb = fib_get_table(net, cfg.fc_table); if (tb) err = tb->tb_delete(tb, &cfg); else err = -ESRCH; } else { - tb = fib_new_table(cfg.fc_table); + tb = fib_new_table(net, cfg.fc_table); if (tb) err = tb->tb_insert(tb, &cfg); else @@ -466,8 +510,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { [RTA_FLOW] = { .type = NLA_U32 }, }; -static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh, - struct fib_config *cfg) +static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, + struct nlmsghdr *nlh, struct fib_config *cfg) { struct nlattr *attr; int err, remaining; @@ -491,6 +535,7 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; cfg->fc_nlinfo.nlh = nlh; + cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { err = -EINVAL; @@ -538,15 +583,16 @@ errout: static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib_config cfg; struct fib_table *tb; int err; - err = rtm_to_fib_config(skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg); if (err < 0) goto errout; - tb = fib_get_table(cfg.fc_table); + tb = fib_get_table(net, cfg.fc_table); if (tb == NULL) { err = -ESRCH; goto errout; @@ -559,15 +605,16 @@ errout: static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = skb->sk->sk_net; struct fib_config cfg; struct fib_table *tb; int err; - err = rtm_to_fib_config(skb, nlh, &cfg); + err = rtm_to_fib_config(net, skb, nlh, &cfg); if (err < 0) goto errout; - tb = fib_new_table(cfg.fc_table); + tb = fib_new_table(net, cfg.fc_table); if (tb == NULL) { err = -ENOBUFS; goto errout; @@ -580,10 +627,12 @@ errout: static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb->sk->sk_net; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_node *node; + struct hlist_head *head; int dumped = 0; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && @@ -595,7 +644,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; - hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, node, head, tb_hlist) { if (e < s_e) goto next; if (dumped) @@ -624,6 +674,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { + struct net *net = ifa->ifa_dev->dev->nd_net; struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -633,12 +684,15 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, + .fc_nlinfo = { + .nl_net = net, + }, }; if (type == RTN_UNICAST) - tb = fib_new_table(RT_TABLE_MAIN); + tb = fib_new_table(net, RT_TABLE_MAIN); else - tb = fib_new_table(RT_TABLE_LOCAL); + tb = fib_new_table(net, RT_TABLE_LOCAL); if (tb == NULL) return; @@ -668,7 +722,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_flags&IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (prim == NULL) { - printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n"); + printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); return; } } @@ -682,7 +736,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); @@ -715,7 +769,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) else { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { - printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n"); + printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); return; } } @@ -747,7 +801,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + if (inet_addr_type(dev->nd_net, ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. We must flush stray FIB entries. @@ -755,7 +809,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down(ifa->ifa_local, NULL, 0)) - fib_flush(); + fib_flush(dev->nd_net); } } #undef LOCAL_OK @@ -797,11 +851,13 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) static void nl_fib_input(struct sk_buff *skb) { + struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; struct fib_table *tb; u32 pid; + net = skb->sk->sk_net; nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) @@ -813,26 +869,37 @@ static void nl_fib_input(struct sk_buff *skb) nlh = nlmsg_hdr(skb); frn = (struct fib_result_nl *) NLMSG_DATA(nlh); - tb = fib_get_table(frn->tb_id_in); + tb = fib_get_table(net, frn->tb_id_in); nl_fib_lookup(frn, tb); pid = NETLINK_CB(skb).pid; /* pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ - netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT); + netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); +} + +static int nl_fib_lookup_init(struct net *net) +{ + struct sock *sk; + sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, + nl_fib_input, NULL, THIS_MODULE); + if (sk == NULL) + return -EAFNOSUPPORT; + net->ipv4.fibnl = sk; + return 0; } -static void nl_fib_lookup_init(void) +static void nl_fib_lookup_exit(struct net *net) { - fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, - nl_fib_input, NULL, THIS_MODULE); + netlink_kernel_release(net->ipv4.fibnl); + net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down(0, dev, force)) - fib_flush(); + fib_flush(dev->nd_net); rt_cache_flush(0); arp_ifdown(dev); } @@ -869,9 +936,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); - if (dev->nd_net != &init_net) - return NOTIFY_DONE; - if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2); return NOTIFY_DONE; @@ -909,23 +973,92 @@ static struct notifier_block fib_netdev_notifier = { .notifier_call =fib_netdev_event, }; -void __init ip_fib_init(void) +static int __net_init ip_fib_net_init(struct net *net) { unsigned int i; + net->ipv4.fib_table_hash = kzalloc( + sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL); + if (net->ipv4.fib_table_hash == NULL) + return -ENOMEM; + for (i = 0; i < FIB_TABLE_HASHSZ; i++) - INIT_HLIST_HEAD(&fib_table_hash[i]); + INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]); - fib4_rules_init(); + return fib4_rules_init(net); +} - register_netdevice_notifier(&fib_netdev_notifier); - register_inetaddr_notifier(&fib_inetaddr_notifier); - nl_fib_lookup_init(); +static void __net_exit ip_fib_net_exit(struct net *net) +{ + unsigned int i; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + fib4_rules_exit(net); +#endif + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct fib_table *tb; + struct hlist_head *head; + struct hlist_node *node, *tmp; + + head = &net->ipv4.fib_table_hash[i]; + hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { + hlist_del(node); + tb->tb_flush(tb); + kfree(tb); + } + } + kfree(net->ipv4.fib_table_hash); +} + +static int __net_init fib_net_init(struct net *net) +{ + int error; + + error = ip_fib_net_init(net); + if (error < 0) + goto out; + error = nl_fib_lookup_init(net); + if (error < 0) + goto out_nlfl; + error = fib_proc_init(net); + if (error < 0) + goto out_proc; +out: + return error; + +out_proc: + nl_fib_lookup_exit(net); +out_nlfl: + ip_fib_net_exit(net); + goto out; +} + +static void __net_exit fib_net_exit(struct net *net) +{ + fib_proc_exit(net); + nl_fib_lookup_exit(net); + ip_fib_net_exit(net); +} + +static struct pernet_operations fib_net_ops = { + .init = fib_net_init, + .exit = fib_net_exit, +}; + +void __init ip_fib_init(void) +{ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL); rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib); + + register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); + register_inetaddr_notifier(&fib_inetaddr_notifier); + + fib_hash_init(); } EXPORT_SYMBOL(inet_addr_type); +EXPORT_SYMBOL(inet_dev_addr_type); EXPORT_SYMBOL(ip_dev_find); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 0dfee27cfbcd..a15b2f1b2721 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -52,6 +52,7 @@ struct fib_node { struct hlist_node fn_hash; struct list_head fn_alias; __be32 fn_key; + struct fib_alias fn_embedded_alias; }; struct fn_zone { @@ -102,10 +103,10 @@ static struct hlist_head *fz_hash_alloc(int divisor) unsigned long size = divisor * sizeof(struct hlist_head); if (size <= PAGE_SIZE) { - return kmalloc(size, GFP_KERNEL); + return kzalloc(size, GFP_KERNEL); } else { return (struct hlist_head *) - __get_free_pages(GFP_KERNEL, get_order(size)); + __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); } } @@ -168,14 +169,13 @@ static void fn_rehash_zone(struct fn_zone *fz) new_hashmask = (new_divisor - 1); #if RT_CACHE_DEBUG >= 2 - printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor); + printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", + fz->fz_order, old_divisor); #endif ht = fz_hash_alloc(new_divisor); if (ht) { - memset(ht, 0, new_divisor * sizeof(struct hlist_head)); - write_lock_bh(&fib_hash_lock); old_ht = fz->fz_hash; fz->fz_hash = ht; @@ -194,10 +194,13 @@ static inline void fn_free_node(struct fib_node * f) kmem_cache_free(fn_hash_kmem, f); } -static inline void fn_free_alias(struct fib_alias *fa) +static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) { fib_release_info(fa->fa_info); - kmem_cache_free(fn_alias_kmem, fa); + if (fa == &f->fn_embedded_alias) + fa->fa_info = NULL; + else + kmem_cache_free(fn_alias_kmem, fa); } static struct fn_zone * @@ -219,7 +222,6 @@ fn_new_zone(struct fn_hash *table, int z) kfree(fz); return NULL; } - memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *)); fz->fz_order = z; fz->fz_mask = inet_make_mask(z); @@ -275,8 +277,6 @@ out: return err; } -static int fn_hash_last_dflt=-1; - static void fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) { @@ -317,12 +317,9 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib if (next_fi != res->fi) break; } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, &fn_hash_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - fn_hash_last_dflt = order; + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } fi = next_fi; @@ -331,27 +328,20 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib } if (order <= 0 || fi == NULL) { - fn_hash_last_dflt = -1; + tb->tb_default = -1; goto out; } - if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - fn_hash_last_dflt = order; + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } - if (last_idx >= 0) { - if (res->fi) - fib_info_put(res->fi); - res->fi = last_resort; - if (last_resort) - atomic_inc(&last_resort->fib_clntref); - } - fn_hash_last_dflt = last_idx; + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; out: read_unlock(&fib_hash_lock); } @@ -490,15 +480,12 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) goto out; err = -ENOBUFS; - new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) - goto out; new_f = NULL; if (!f) { - new_f = kmem_cache_alloc(fn_hash_kmem, GFP_KERNEL); + new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); if (new_f == NULL) - goto out_free_new_fa; + goto out; INIT_HLIST_NODE(&new_f->fn_hash); INIT_LIST_HEAD(&new_f->fn_alias); @@ -506,6 +493,12 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) f = new_f; } + new_fa = &f->fn_embedded_alias; + if (new_fa->fa_info != NULL) { + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + if (new_fa == NULL) + goto out_free_new_f; + } new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; @@ -532,8 +525,8 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) &cfg->fc_nlinfo, 0); return 0; -out_free_new_fa: - kmem_cache_free(fn_alias_kmem, new_fa); +out_free_new_f: + kmem_cache_free(fn_hash_kmem, new_f); out: fib_release_info(fi); return err; @@ -609,7 +602,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(-1); - fn_free_alias(fa); + fn_free_alias(fa, f); if (kill_fn) { fn_free_node(f); fz->fz_nent--; @@ -645,7 +638,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx) fib_hash_genid++; write_unlock_bh(&fib_hash_lock); - fn_free_alias(fa); + fn_free_alias(fa, f); found++; } } @@ -761,25 +754,19 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin return skb->len; } -#ifdef CONFIG_IP_MULTIPLE_TABLES -struct fib_table * fib_hash_init(u32 id) -#else -struct fib_table * __init fib_hash_init(u32 id) -#endif +void __init fib_hash_init(void) { - struct fib_table *tb; + fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), + 0, SLAB_PANIC, NULL); - if (fn_hash_kmem == NULL) - fn_hash_kmem = kmem_cache_create("ip_fib_hash", - sizeof(struct fib_node), - 0, SLAB_HWCACHE_ALIGN, - NULL); + fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), + 0, SLAB_PANIC, NULL); - if (fn_alias_kmem == NULL) - fn_alias_kmem = kmem_cache_create("ip_fib_alias", - sizeof(struct fib_alias), - 0, SLAB_HWCACHE_ALIGN, - NULL); +} + +struct fib_table *fib_hash_table(u32 id) +{ + struct fib_table *tb; tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL); @@ -787,6 +774,7 @@ struct fib_table * __init fib_hash_init(u32 id) return NULL; tb->tb_id = id; + tb->tb_default = -1; tb->tb_lookup = fn_hash_lookup; tb->tb_insert = fn_hash_insert; tb->tb_delete = fn_hash_delete; @@ -801,6 +789,7 @@ struct fib_table * __init fib_hash_init(u32 id) #ifdef CONFIG_PROC_FS struct fib_iter_state { + struct seq_net_private p; struct fn_zone *zone; int bucket; struct hlist_head *hash_head; @@ -814,7 +803,11 @@ struct fib_iter_state { static struct fib_alias *fib_get_first(struct seq_file *seq) { struct fib_iter_state *iter = seq->private; - struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data; + struct fib_table *main_table; + struct fn_hash *table; + + main_table = fib_get_table(iter->p.net, RT_TABLE_MAIN); + table = (struct fn_hash *)main_table->tb_data; iter->bucket = 0; iter->hash_head = NULL; @@ -949,11 +942,13 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) } static void *fib_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(fib_hash_lock) { + struct fib_iter_state *iter = seq->private; void *v = NULL; read_lock(&fib_hash_lock); - if (ip_fib_main_table) + if (fib_get_table(iter->p.net, RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; return v; } @@ -965,6 +960,7 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void fib_seq_stop(struct seq_file *seq, void *v) + __releases(fib_hash_lock) { read_unlock(&fib_hash_lock); } @@ -1040,8 +1036,8 @@ static const struct seq_operations fib_seq_ops = { static int fib_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &fib_seq_ops, - sizeof(struct fib_iter_state)); + return seq_open_net(inode, file, &fib_seq_ops, + sizeof(struct fib_iter_state)); } static const struct file_operations fib_seq_fops = { @@ -1049,18 +1045,18 @@ static const struct file_operations fib_seq_fops = { .open = fib_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init fib_proc_init(void) +int __net_init fib_proc_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops)) + if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) return -ENOMEM; return 0; } -void __init fib_proc_exit(void) +void __net_exit fib_proc_exit(struct net *net) { - proc_net_remove(&init_net, "route"); + proc_net_remove(net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index eef9eec17e0c..2c1623d2768b 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -7,12 +7,14 @@ struct fib_alias { struct list_head fa_list; - struct rcu_head rcu; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; u8 fa_scope; u8 fa_state; +#ifdef CONFIG_IP_FIB_TRIE + struct rcu_head rcu; +#endif }; #define FA_S_ACCESSED 0x01 @@ -36,6 +38,16 @@ extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); extern int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, - int *last_idx, int *dflt); + int *last_idx, int dflt); + +static inline void fib_result_assign(struct fib_result *res, + struct fib_info *fi) +{ + if (res->fi != NULL) + fib_info_put(res->fi); + res->fi = fi; + if (fi != NULL) + atomic_inc(&fi->fib_clntref); +} #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a0ada3a8d8dd..19274d01afa4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -32,8 +32,6 @@ #include <net/ip_fib.h> #include <net/fib_rules.h> -static struct fib_rules_ops fib4_rules_ops; - struct fib4_rule { struct fib_rule common; @@ -56,14 +54,14 @@ u32 fib_rules_tclass(struct fib_result *res) } #endif -int fib_lookup(struct flowi *flp, struct fib_result *res) +int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, }; int err; - err = fib_rules_lookup(&fib4_rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); res->r = arg.rule; return err; @@ -93,7 +91,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, goto errout; } - if ((tbl = fib_get_table(rule->table)) == NULL) + if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) goto errout; err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); @@ -104,16 +102,6 @@ errout: } -void fib_select_default(const struct flowi *flp, struct fib_result *res) -{ - if (res->r && res->r->action == FR_ACT_TO_TBL && - FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { - struct fib_table *tb; - if ((tb = fib_get_table(res->r->table)) != NULL) - tb->tb_select_default(tb, flp, res); - } -} - static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; @@ -130,13 +118,13 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) return 1; } -static struct fib_table *fib_empty_table(void) +static struct fib_table *fib_empty_table(struct net *net) { u32 id; for (id = 1; id <= RT_TABLE_MAX; id++) - if (fib_get_table(id) == NULL) - return fib_new_table(id); + if (fib_get_table(net, id) == NULL) + return fib_new_table(net, id); return NULL; } @@ -149,6 +137,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_rule_hdr *frh, struct nlattr **tb) { + struct net *net = skb->sk->sk_net; int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; @@ -159,7 +148,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; - table = fib_empty_table(); + table = fib_empty_table(net); if (table == NULL) { err = -ENOBUFS; goto errout; @@ -245,14 +234,14 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib4_rule_default_pref(void) +static u32 fib4_rule_default_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; - if (!list_empty(&fib4_rules_ops.rules_list)) { - pos = fib4_rules_ops.rules_list.next; - if (pos->next != &fib4_rules_ops.rules_list) { + if (!list_empty(&ops->rules_list)) { + pos = ops->rules_list.next; + if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; @@ -274,7 +263,7 @@ static void fib4_rule_flush_cache(void) rt_cache_flush(-1); } -static struct fib_rules_ops fib4_rules_ops = { +static struct fib_rules_ops fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), @@ -288,31 +277,53 @@ static struct fib_rules_ops fib4_rules_ops = { .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .policy = fib4_rule_policy, - .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list), .owner = THIS_MODULE, }; -static int __init fib_default_rules_init(void) +static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(&fib4_rules_ops, 0, - RT_TABLE_LOCAL, FIB_RULE_PERMANENT); + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); if (err < 0) return err; - err = fib_default_rule_add(&fib4_rules_ops, 0x7FFE, - RT_TABLE_MAIN, 0); + err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); if (err < 0) return err; - err = fib_default_rule_add(&fib4_rules_ops, 0x7FFF, - RT_TABLE_DEFAULT, 0); + err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0); if (err < 0) return err; return 0; } -void __init fib4_rules_init(void) +int __net_init fib4_rules_init(struct net *net) +{ + int err; + struct fib_rules_ops *ops; + + ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL); + if (ops == NULL) + return -ENOMEM; + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + fib_rules_register(ops); + + err = fib_default_rules_init(ops); + if (err < 0) + goto fail; + net->ipv4.rules_ops = ops; + return 0; + +fail: + /* also cleans all rules already added */ + fib_rules_unregister(ops); + kfree(ops); + return err; +} + +void __net_exit fib4_rules_exit(struct net *net) { - BUG_ON(fib_default_rules_init()); - fib_rules_register(&fib4_rules_ops); + fib_rules_unregister(net->ipv4.rules_ops); + kfree(net->ipv4.rules_ops); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1351a2617dce..c7912866d987 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -47,8 +47,6 @@ #include "fib_lookup.h" -#define FSprintk(a...) - static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; @@ -145,7 +143,7 @@ static const struct void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { - printk("Freeing alive fib_info %p\n", fi); + printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); return; } change_nexthops(fi) { @@ -196,6 +194,15 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * return 0; } +static inline unsigned int fib_devindex_hashfn(unsigned int val) +{ + unsigned int mask = DEVINDEX_HASHSIZE - 1; + + return (val ^ + (val >> DEVINDEX_HASHBITS) ^ + (val >> (DEVINDEX_HASHBITS * 2))) & mask; +} + static inline unsigned int fib_info_hashfn(const struct fib_info *fi) { unsigned int mask = (fib_hash_size - 1); @@ -204,6 +211,9 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) val ^= fi->fib_protocol; val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; + for_nexthops(fi) { + val ^= fib_devindex_hashfn(nh->nh_oif); + } endfor_nexthops(fi) return (val ^ (val >> 7) ^ (val >> 12)) & mask; } @@ -234,15 +244,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) return NULL; } -static inline unsigned int fib_devindex_hashfn(unsigned int val) -{ - unsigned int mask = DEVINDEX_HASHSIZE - 1; - - return (val ^ - (val >> DEVINDEX_HASHBITS) ^ - (val >> (DEVINDEX_HASHBITS * 2))) & mask; -} - /* Check, that the gateway is already configured. Used only by redirect accept routine. */ @@ -320,11 +321,11 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, kfree_skb(skb); goto errout; } - err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE, + err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL); errout: if (err < 0) - rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err); + rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } /* Return the first fib alias matching TOS with @@ -346,7 +347,7 @@ struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) } int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int *dflt) + struct fib_info **last_resort, int *last_idx, int dflt) { struct neighbour *n; int state = NUD_NONE; @@ -358,10 +359,10 @@ int fib_detect_death(struct fib_info *fi, int order, } if (state==NUD_REACHABLE) return 0; - if ((state&NUD_VALID) && order != *dflt) + if ((state&NUD_VALID) && order != dflt) return 0; if ((state&NUD_VALID) || - (*last_idx<0 && order > *dflt)) { + (*last_idx<0 && order > dflt)) { *last_resort = fi; *last_idx = order; } @@ -518,7 +519,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { int err; + struct net *net; + net = cfg->fc_nlinfo.nl_net; if (nh->nh_gw) { struct fib_result res; @@ -531,9 +534,9 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; - if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) + if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) return -EINVAL; - if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) + if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) return -ENODEV; if (!(dev->flags&IFF_UP)) return -ENETDOWN; @@ -556,7 +559,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl.fl4_scope < RT_SCOPE_LINK) fl.fl4_scope = RT_SCOPE_LINK; - if ((err = fib_lookup(&fl, &res)) != 0) + if ((err = fib_lookup(net, &fl, &res)) != 0) return err; } err = -EINVAL; @@ -580,7 +583,7 @@ out: if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) return -EINVAL; - in_dev = inetdev_by_index(nh->nh_oif); + in_dev = inetdev_by_index(net, nh->nh_oif); if (in_dev == NULL) return -ENODEV; if (!(in_dev->dev->flags&IFF_UP)) { @@ -605,10 +608,10 @@ static inline unsigned int fib_laddr_hashfn(__be32 val) static struct hlist_head *fib_hash_alloc(int bytes) { if (bytes <= PAGE_SIZE) - return kmalloc(bytes, GFP_KERNEL); + return kzalloc(bytes, GFP_KERNEL); else return (struct hlist_head *) - __get_free_pages(GFP_KERNEL, get_order(bytes)); + __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); } static void fib_hash_free(struct hlist_head *hash, int bytes) @@ -712,12 +715,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!new_info_hash || !new_laddrhash) { fib_hash_free(new_info_hash, bytes); fib_hash_free(new_laddrhash, bytes); - } else { - memset(new_info_hash, 0, bytes); - memset(new_laddrhash, 0, bytes); - + } else fib_hash_move(new_info_hash, new_laddrhash, new_size); - } if (!fib_hash_size) goto failure; @@ -799,7 +798,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (nhs != 1 || nh->nh_gw) goto err_inval; nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); + nh->nh_dev = dev_get_by_index(cfg->fc_nlinfo.nl_net, + fi->fib_nh->nh_oif); err = -ENODEV; if (nh->nh_dev == NULL) goto failure; @@ -813,7 +813,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (fi->fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fi->fib_prefsrc != cfg->fc_dst) - if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + if (inet_addr_type(cfg->fc_nlinfo.nl_net, + fi->fib_prefsrc) != RTN_LOCAL) goto err_inval; } @@ -914,7 +915,8 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, continue; default: - printk(KERN_DEBUG "impossible 102\n"); + printk(KERN_WARNING "fib_semantic_match bad type %#x\n", + fa->fa_type); return -EINVAL; } } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1010b469d7d3..f2f47033f31f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -82,7 +82,6 @@ #include <net/ip_fib.h> #include "fib_lookup.h" -#undef CONFIG_IP_FIB_TRIE_STATS #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) @@ -98,13 +97,13 @@ typedef unsigned int t_key; #define IS_LEAF(n) (n->parent & T_LEAF) struct node { - t_key key; unsigned long parent; + t_key key; }; struct leaf { - t_key key; unsigned long parent; + t_key key; struct hlist_head list; struct rcu_head rcu; }; @@ -117,12 +116,12 @@ struct leaf_info { }; struct tnode { - t_key key; unsigned long parent; - unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short full_children; /* KEYLENGTH bits needed */ - unsigned short empty_children; /* KEYLENGTH bits needed */ + t_key key; + unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ + unsigned int full_children; /* KEYLENGTH bits needed */ + unsigned int empty_children; /* KEYLENGTH bits needed */ struct rcu_head rcu; struct node *child[0]; }; @@ -144,6 +143,7 @@ struct trie_stat { unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; + unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; @@ -152,25 +152,28 @@ struct trie { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif - int size; - unsigned int revision; }; static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, + int wasfull); static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static struct kmem_cache *fn_alias_kmem __read_mostly; -static struct trie *trie_local = NULL, *trie_main = NULL; +static struct kmem_cache *trie_leaf_kmem __read_mostly; static inline struct tnode *node_parent(struct node *node) { - struct tnode *ret; + return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); +} + +static inline struct tnode *node_parent_rcu(struct node *node) +{ + struct tnode *ret = node_parent(node); - ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK); return rcu_dereference(ret); } @@ -180,13 +183,18 @@ static inline void node_set_parent(struct node *node, struct tnode *ptr) (unsigned long)ptr | NODE_TYPE(node)); } -/* rcu_read_lock needs to be hold by caller from readside */ +static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) +{ + BUG_ON(i >= 1U << tn->bits); + + return tn->child[i]; +} -static inline struct node *tnode_get_child(struct tnode *tn, int i) +static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { - BUG_ON(i >= 1 << tn->bits); + struct node *ret = tnode_get_child(tn, i); - return rcu_dereference(tn->child[i]); + return rcu_dereference(ret); } static inline int tnode_child_length(const struct tnode *tn) @@ -300,10 +308,10 @@ static inline void check_tnode(const struct tnode *tn) WARN_ON(tn && tn->pos+tn->bits > 32); } -static int halve_threshold = 25; -static int inflate_threshold = 50; -static int halve_threshold_root = 8; -static int inflate_threshold_root = 15; +static const int halve_threshold = 25; +static const int inflate_threshold = 50; +static const int halve_threshold_root = 8; +static const int inflate_threshold_root = 15; static void __alias_free_mem(struct rcu_head *head) @@ -319,7 +327,8 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) static void __leaf_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct leaf, rcu)); + struct leaf *l = container_of(head, struct leaf, rcu); + kmem_cache_free(trie_leaf_kmem, l); } static void __leaf_info_free_rcu(struct rcu_head *head) @@ -332,12 +341,12 @@ static inline void free_leaf_info(struct leaf_info *leaf) call_rcu(&leaf->rcu, __leaf_info_free_rcu); } -static struct tnode *tnode_alloc(unsigned int size) +static struct tnode *tnode_alloc(size_t size) { struct page *pages; if (size <= PAGE_SIZE) - return kcalloc(size, 1, GFP_KERNEL); + return kzalloc(size, GFP_KERNEL); pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); if (!pages) @@ -349,8 +358,8 @@ static struct tnode *tnode_alloc(unsigned int size) static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); - unsigned int size = sizeof(struct tnode) + - (1 << tn->bits) * sizeof(struct node *); + size_t size = sizeof(struct tnode) + + (sizeof(struct node *) << tn->bits); if (size <= PAGE_SIZE) kfree(tn); @@ -369,7 +378,7 @@ static inline void tnode_free(struct tnode *tn) static struct leaf *leaf_new(void) { - struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); + struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (l) { l->parent = T_LEAF; INIT_HLIST_HEAD(&l->list); @@ -387,14 +396,12 @@ static struct leaf_info *leaf_info_new(int plen) return li; } -static struct tnode* tnode_new(t_key key, int pos, int bits) +static struct tnode *tnode_new(t_key key, int pos, int bits) { - int nchildren = 1<<bits; - int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); + size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); struct tnode *tn = tnode_alloc(sz); if (tn) { - memset(tn, 0, sz); tn->parent = T_TNODE; tn->pos = pos; tn->bits = bits; @@ -403,8 +410,8 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<<bits; } - pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), - (unsigned int) (sizeof(struct node) * 1<<bits)); + pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), + (unsigned long) (sizeof(struct node) << bits)); return tn; } @@ -421,7 +428,8 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n) return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) +static inline void put_child(struct trie *t, struct tnode *tn, int i, + struct node *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -431,14 +439,14 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) +static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, + int wasfull) { struct node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<<tn->bits); - /* update emptyChildren */ if (n == NULL && chi != NULL) tn->empty_children++; @@ -571,11 +579,13 @@ static struct node *resize(struct trie *t, struct tnode *tn) err = 0; max_resize = 10; while ((tn->full_children > 0 && max_resize-- && - 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= - inflate_threshold_use * tnode_child_length(tn))) { + 50 * (tn->full_children + tnode_child_length(tn) + - tn->empty_children) + >= inflate_threshold_use * tnode_child_length(tn))) { old_tn = tn; tn = inflate(t, tn); + if (IS_ERR(tn)) { tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -587,11 +597,13 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (max_resize < 0) { if (!tn->parent) - printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n", - inflate_threshold_root, tn->bits); + pr_warning("Fix inflate_threshold_root." + " Now=%d size=%d bits\n", + inflate_threshold_root, tn->bits); else - printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n", - inflate_threshold, tn->bits); + pr_warning("Fix inflate_threshold." + " Now=%d size=%d bits\n", + inflate_threshold, tn->bits); } check_tnode(tn); @@ -628,11 +640,13 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (max_resize < 0) { if (!tn->parent) - printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n", - halve_threshold_root, tn->bits); + pr_warning("Fix halve_threshold_root." + " Now=%d size=%d bits\n", + halve_threshold_root, tn->bits); else - printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n", - halve_threshold, tn->bits); + pr_warning("Fix halve_threshold." + " Now=%d size=%d bits\n", + halve_threshold, tn->bits); } /* Only one child remains */ @@ -656,7 +670,6 @@ static struct node *resize(struct trie *t, struct tnode *tn) static struct tnode *inflate(struct trie *t, struct tnode *tn) { - struct tnode *inode; struct tnode *oldtnode = tn; int olen = tnode_child_length(tn); int i; @@ -676,8 +689,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) */ for (i = 0; i < olen; i++) { - struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + struct tnode *inode; + inode = (struct tnode *) tnode_get_child(oldtnode, i); if (inode && IS_TNODE(inode) && inode->pos == oldtnode->pos + oldtnode->bits && @@ -704,6 +718,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) } for (i = 0; i < olen; i++) { + struct tnode *inode; struct node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; int size, j; @@ -716,8 +731,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if (IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, - 1) == 0) + if (tkey_extract_bits(node->key, + oldtnode->pos + oldtnode->bits, + 1) == 0) put_child(t, tn, 2*i, node); else put_child(t, tn, 2*i+1, node); @@ -877,19 +893,6 @@ nomem: } } -static void trie_init(struct trie *t) -{ - if (!t) - return; - - t->size = 0; - rcu_assign_pointer(t->trie, NULL); - t->revision = 0; -#ifdef CONFIG_IP_FIB_TRIE_STATS - memset(&t->stats, 0, sizeof(struct trie_use_stats)); -#endif -} - /* readside must use rcu_read_lock currently dump routines via get_fa_head and dump */ @@ -906,7 +909,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen) return NULL; } -static inline struct list_head * get_fa_head(struct leaf *l, int plen) +static inline struct list_head *get_fa_head(struct leaf *l, int plen) { struct leaf_info *li = find_leaf_info(l, plen); @@ -956,7 +959,10 @@ fib_find_node(struct trie *t, u32 key) if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { pos = tn->pos + tn->bits; - n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + n = tnode_get_child_rcu(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); } else break; } @@ -977,8 +983,10 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); - tn = (struct tnode *) resize (t, (struct tnode *)tn); - tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); + tn = (struct tnode *) resize(t, (struct tnode *)tn); + + tnode_put_child_reorg((struct tnode *)tp, cindex, + (struct node *)tn, wasfull); tp = node_parent((struct node *) tn); if (!tp) @@ -988,15 +996,14 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) /* Handle last (top) tnode */ if (IS_TNODE(tn)) - tn = (struct tnode*) resize(t, (struct tnode *)tn); + tn = (struct tnode *)resize(t, (struct tnode *)tn); - return (struct node*) tn; + return (struct node *)tn; } /* only used from updater-side */ -static struct list_head * -fib_insert_node(struct trie *t, int *err, u32 key, int plen) +static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; @@ -1036,7 +1043,10 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; pos = tn->pos + tn->bits; - n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); + n = tnode_get_child(tn, + tkey_extract_bits(key, + tn->pos, + tn->bits)); BUG_ON(n && node_parent(n) != tn); } else @@ -1054,34 +1064,27 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) /* Case 1: n is a leaf. Compare prefixes */ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = (struct leaf *) n; - + l = (struct leaf *) n; li = leaf_info_new(plen); - if (!li) { - *err = -ENOMEM; - goto err; - } + if (!li) + return NULL; fa_head = &li->falh; insert_leaf_info(&l->list, li); goto done; } - t->size++; l = leaf_new(); - if (!l) { - *err = -ENOMEM; - goto err; - } + if (!l) + return NULL; l->key = key; li = leaf_info_new(plen); if (!li) { tnode_free((struct tnode *) l); - *err = -ENOMEM; - goto err; + return NULL; } fa_head = &li->falh; @@ -1117,8 +1120,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (!tn) { free_leaf_info(li); tnode_free((struct tnode *) l); - *err = -ENOMEM; - goto err; + return NULL; } node_set_parent((struct node *)tn, tp); @@ -1129,23 +1131,23 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); + put_child(t, (struct tnode *)tp, cindex, + (struct node *)tn); } else { - rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ + rcu_assign_pointer(t->trie, (struct node *)tn); tp = tn; } } if (tp && tp->pos + tp->bits > 32) - printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", - tp, tp->pos, tp->bits, key, plen); + pr_warning("fib_trie" + " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", + tp, tp->pos, tp->bits, key, plen); /* Rebalance the trie */ rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); done: - t->revision++; -err: return fa_head; } @@ -1253,10 +1255,10 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) break; if (fa->fa_type == cfg->fc_type && fa->fa_scope == cfg->fc_scope && - fa->fa_info == fi) { + fa->fa_info == fi) goto out; - } } + if (!(cfg->fc_nlflags & NLM_F_APPEND)) fa = fa_orig; } @@ -1279,10 +1281,11 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) */ if (!fa_head) { - err = 0; - fa_head = fib_insert_node(t, &err, key, plen); - if (err) + fa_head = fib_insert_node(t, key, plen); + if (unlikely(!fa_head)) { + err = -ENOMEM; goto out_free_new_fa; + } } list_add_tail_rcu(&new_fa->fa_list, @@ -1302,40 +1305,41 @@ err: return err; } - /* should be called with rcu_read_lock */ -static inline int check_leaf(struct trie *t, struct leaf *l, - t_key key, int *plen, const struct flowi *flp, - struct fib_result *res) +static int check_leaf(struct trie *t, struct leaf *l, + t_key key, const struct flowi *flp, + struct fib_result *res) { - int err, i; - __be32 mask; struct leaf_info *li; struct hlist_head *hhead = &l->list; struct hlist_node *node; hlist_for_each_entry_rcu(li, node, hhead, hlist) { - i = li->plen; - mask = inet_make_mask(i); + int err; + int plen = li->plen; + __be32 mask = inet_make_mask(plen); + if (l->key != (key & ntohl(mask))) continue; - if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) { - *plen = i; + err = fib_semantic_match(&li->falh, flp, res, + htonl(l->key), mask, plen); + #ifdef CONFIG_IP_FIB_TRIE_STATS + if (err <= 0) t->stats.semantic_match_passed++; + else + t->stats.semantic_match_miss++; #endif - return err; - } -#ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_miss++; -#endif + if (err <= 0) + return plen; } - return 1; + + return -1; } -static int -fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int plen, ret = 0; @@ -1362,10 +1366,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result /* Just a leaf? */ if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - goto failed; + plen = check_leaf(t, (struct leaf *)n, key, flp, res); + if (plen < 0) + goto failed; + ret = 0; + goto found; } + pn = (struct tnode *) n; chopped_off = 0; @@ -1387,14 +1394,14 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result } if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - else + plen = check_leaf(t, (struct leaf *)n, key, flp, res); + if (plen < 0) goto backtrace; + + ret = 0; + goto found; } -#define HL_OPTIMIZE -#ifdef HL_OPTIMIZE cn = (struct tnode *)n; /* @@ -1423,12 +1430,13 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result * *are* zero. */ - /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ + /* NOTA BENE: Checking only skipped bits + for the new node here */ if (current_prefix_length < pos+bits) { if (tkey_extract_bits(cn->key, current_prefix_length, - cn->pos - current_prefix_length) != 0 || - !(cn->child[0])) + cn->pos - current_prefix_length) + || !(cn->child[0])) goto backtrace; } @@ -1451,14 +1459,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result * new tnode's key. */ - /* Note: We aren't very concerned about the piece of the key - * that precede pn->pos+pn->bits, since these have already been - * checked. The bits after cn->pos aren't checked since these are - * by definition "unknown" at this point. Thus, what we want to - * see is if we are about to enter the "prefix matching" state, - * and in that case verify that the skipped bits that will prevail - * throughout this subtree are zero, as they have to be if we are - * to find a matching prefix. + /* + * Note: We aren't very concerned about the piece of + * the key that precede pn->pos+pn->bits, since these + * have already been checked. The bits after cn->pos + * aren't checked since these are by definition + * "unknown" at this point. Thus, what we want to see + * is if we are about to enter the "prefix matching" + * state, and in that case verify that the skipped + * bits that will prevail throughout this subtree are + * zero, as they have to be if we are to find a + * matching prefix. */ node_prefix = mask_pfx(cn->key, cn->pos); @@ -1466,13 +1477,15 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result pref_mismatch = key_prefix^node_prefix; mp = 0; - /* In short: If skipped bits in this node do not match the search - * key, enter the "prefix matching" state.directly. + /* + * In short: If skipped bits in this node do not match + * the search key, enter the "prefix matching" + * state.directly. */ if (pref_mismatch) { while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { mp++; - pref_mismatch = pref_mismatch <<1; + pref_mismatch = pref_mismatch << 1; } key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); @@ -1482,7 +1495,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result if (current_prefix_length >= cn->pos) current_prefix_length = mp; } -#endif + pn = (struct tnode *)n; /* Descend */ chopped_off = 0; continue; @@ -1491,12 +1504,14 @@ backtrace: chopped_off++; /* As zero don't change the child key (cindex) */ - while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) + while ((chopped_off <= pn->bits) + && !(cindex & (1<<(chopped_off-1)))) chopped_off++; /* Decrease current_... with bits chopped off */ if (current_prefix_length > pn->pos + pn->bits - chopped_off) - current_prefix_length = pn->pos + pn->bits - chopped_off; + current_prefix_length = pn->pos + pn->bits + - chopped_off; /* * Either we do the actual chop off according or if we have @@ -1528,52 +1543,23 @@ found: return ret; } -/* only called from updater side */ -static int trie_leaf_remove(struct trie *t, t_key key) +/* + * Remove the leaf and return parent. + */ +static void trie_leaf_remove(struct trie *t, struct leaf *l) { - t_key cindex; - struct tnode *tp = NULL; - struct node *n = t->trie; - struct leaf *l; - - pr_debug("entering trie_leaf_remove(%p)\n", n); - - /* Note that in the case skipped bits, those bits are *not* checked! - * When we finish this, we will have NULL or a T_LEAF, and the - * T_LEAF may or may not match our key. - */ - - while (n != NULL && IS_TNODE(n)) { - struct tnode *tn = (struct tnode *) n; - check_tnode(tn); - n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - - BUG_ON(n && node_parent(n) != tn); - } - l = (struct leaf *) n; - - if (!n || !tkey_equals(l->key, key)) - return 0; - - /* - * Key found. - * Remove the leaf and rebalance the tree - */ - - t->revision++; - t->size--; + struct tnode *tp = node_parent((struct node *) l); - tp = node_parent(n); - tnode_free((struct tnode *) n); + pr_debug("entering trie_leaf_remove(%p)\n", l); if (tp) { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); + t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); } else rcu_assign_pointer(t->trie, NULL); - return 1; + tnode_free((struct tnode *) l); } /* @@ -1651,7 +1637,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) } if (hlist_empty(&l->list)) - trie_leaf_remove(t, key); + trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(-1); @@ -1697,64 +1683,64 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) return found; } -/* rcu_read_lock needs to be hold by caller from readside */ - -static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) +/* + * Scan for the next right leaf starting at node p->child[idx] + * Since we have back pointer, no recursion necessary. + */ +static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) { - struct node *c = (struct node *) thisleaf; - struct tnode *p; - int idx; - struct node *trie = rcu_dereference(t->trie); - - if (c == NULL) { - if (trie == NULL) - return NULL; - - if (IS_LEAF(trie)) /* trie w. just a leaf */ - return (struct leaf *) trie; - - p = (struct tnode*) trie; /* Start */ - } else - p = node_parent(c); - - while (p) { - int pos, last; + do { + t_key idx; - /* Find the next child of the parent */ if (c) - pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); + idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1; else - pos = 0; - - last = 1 << p->bits; - for (idx = pos; idx < last ; idx++) { - c = rcu_dereference(p->child[idx]); + idx = 0; + while (idx < 1u << p->bits) { + c = tnode_get_child_rcu(p, idx++); if (!c) continue; - /* Decend if tnode */ - while (IS_TNODE(c)) { - p = (struct tnode *) c; - idx = 0; - - /* Rightmost non-NULL branch */ - if (p && IS_TNODE(p)) - while (!(c = rcu_dereference(p->child[idx])) - && idx < (1<<p->bits)) idx++; - - /* Done with this tnode? */ - if (idx >= (1 << p->bits) || !c) - goto up; + if (IS_LEAF(c)) { + prefetch(p->child[idx]); + return (struct leaf *) c; } - return (struct leaf *) c; + + /* Rescan start scanning in new node */ + p = (struct tnode *) c; + idx = 0; } -up: - /* No more children go up one step */ + + /* Node empty, walk back up to parent */ c = (struct node *) p; - p = node_parent(c); - } - return NULL; /* Ready. Root of trie */ + } while ( (p = node_parent_rcu(c)) != NULL); + + return NULL; /* Root of trie */ +} + +static struct leaf *trie_firstleaf(struct trie *t) +{ + struct tnode *n = (struct tnode *) rcu_dereference(t->trie); + + if (!n) + return NULL; + + if (IS_LEAF(n)) /* trie is just a leaf */ + return (struct leaf *) n; + + return leaf_walk_rcu(n, NULL); +} + +static struct leaf *trie_nextleaf(struct leaf *l) +{ + struct node *c = (struct node *) l; + struct tnode *p = node_parent(c); + + if (!p) + return NULL; /* trie with just one leaf */ + + return leaf_walk_rcu(p, c); } /* @@ -1763,30 +1749,27 @@ up: static int fn_trie_flush(struct fib_table *tb) { struct trie *t = (struct trie *) tb->tb_data; - struct leaf *ll = NULL, *l = NULL; - int found = 0, h; - - t->revision++; + struct leaf *l, *ll = NULL; + int found = 0; - for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { + for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { found += trie_flush_leaf(t, l); if (ll && hlist_empty(&ll->list)) - trie_leaf_remove(t, ll->key); + trie_leaf_remove(t, ll); ll = l; } if (ll && hlist_empty(&ll->list)) - trie_leaf_remove(t, ll->key); + trie_leaf_remove(t, ll); pr_debug("trie_flush found=%d\n", found); return found; } -static int trie_last_dflt = -1; - -static void -fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +static void fn_trie_select_default(struct fib_table *tb, + const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int order, last_idx; @@ -1831,48 +1814,38 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib if (next_fi != res->fi) break; } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, &trie_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - trie_last_dflt = order; + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } fi = next_fi; order++; } if (order <= 0 || fi == NULL) { - trie_last_dflt = -1; + tb->tb_default = -1; goto out; } - if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) { - if (res->fi) - fib_info_put(res->fi); - res->fi = fi; - atomic_inc(&fi->fib_clntref); - trie_last_dflt = order; + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; goto out; } - if (last_idx >= 0) { - if (res->fi) - fib_info_put(res->fi); - res->fi = last_resort; - if (last_resort) - atomic_inc(&last_resort->fib_clntref); - } - trie_last_dflt = last_idx; - out:; + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; +out: rcu_read_unlock(); } -static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, +static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, + struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { int i, s_i; struct fib_alias *fa; - __be32 xkey = htonl(key); s_i = cb->args[4]; @@ -1885,7 +1858,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi i++; continue; } - BUG_ON(!fa->fa_info); if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, @@ -1896,7 +1868,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi xkey, plen, fa->fa_tos, - fa->fa_info, 0) < 0) { + fa->fa_info, NLM_F_MULTI) < 0) { cb->args[4] = i; return -1; } @@ -1906,109 +1878,118 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi return skb->len; } -static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) +static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) { - int h, s_h; - struct list_head *fa_head; - struct leaf *l = NULL; + struct leaf_info *li; + struct hlist_node *node; + int i, s_i; - s_h = cb->args[3]; + s_i = cb->args[3]; + i = 0; - for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { - if (h < s_h) + /* rcu_read_lock is hold by caller */ + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + if (i < s_i) { + i++; continue; - if (h > s_h) - memset(&cb->args[4], 0, - sizeof(cb->args) - 4*sizeof(cb->args[0])); - - fa_head = get_fa_head(l, plen); + } - if (!fa_head) - continue; + if (i > s_i) + cb->args[4] = 0; - if (list_empty(fa_head)) + if (list_empty(&li->falh)) continue; - if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { - cb->args[3] = h; + if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) { + cb->args[3] = i; return -1; } + i++; } - cb->args[3] = h; + + cb->args[3] = i; return skb->len; } -static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) { - int m, s_m; + struct leaf *l; struct trie *t = (struct trie *) tb->tb_data; - - s_m = cb->args[2]; + t_key key = cb->args[2]; rcu_read_lock(); - for (m = 0; m <= 32; m++) { - if (m < s_m) - continue; - if (m > s_m) - memset(&cb->args[3], 0, - sizeof(cb->args) - 3*sizeof(cb->args[0])); + /* Dump starting at last key. + * Note: 0.0.0.0/0 (ie default) is first key. + */ + if (!key) + l = trie_firstleaf(t); + else { + l = fib_find_node(t, key); + if (!l) { + /* The table changed during the dump, rather than + * giving partial data, just make application retry. + */ + rcu_read_unlock(); + return -EBUSY; + } + } - if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { - cb->args[2] = m; - goto out; + while (l) { + cb->args[2] = l->key; + if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + rcu_read_unlock(); + return -1; } + + l = trie_nextleaf(l); + memset(&cb->args[3], 0, + sizeof(cb->args) - 3*sizeof(cb->args[0])); } rcu_read_unlock(); - cb->args[2] = m; + return skb->len; -out: - rcu_read_unlock(); - return -1; } -/* Fix more generic FIB names for init later */ +void __init fib_hash_init(void) +{ + fn_alias_kmem = kmem_cache_create("ip_fib_alias", + sizeof(struct fib_alias), + 0, SLAB_PANIC, NULL); -#ifdef CONFIG_IP_MULTIPLE_TABLES -struct fib_table * fib_hash_init(u32 id) -#else -struct fib_table * __init fib_hash_init(u32 id) -#endif + trie_leaf_kmem = kmem_cache_create("ip_fib_trie", + max(sizeof(struct leaf), + sizeof(struct leaf_info)), + 0, SLAB_PANIC, NULL); +} + + +/* Fix more generic FIB names for init later */ +struct fib_table *fib_hash_table(u32 id) { struct fib_table *tb; struct trie *t; - if (fn_alias_kmem == NULL) - fn_alias_kmem = kmem_cache_create("ip_fib_alias", - sizeof(struct fib_alias), - 0, SLAB_HWCACHE_ALIGN, - NULL); - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), GFP_KERNEL); if (tb == NULL) return NULL; tb->tb_id = id; + tb->tb_default = -1; tb->tb_lookup = fn_trie_lookup; tb->tb_insert = fn_trie_insert; tb->tb_delete = fn_trie_delete; tb->tb_flush = fn_trie_flush; tb->tb_select_default = fn_trie_select_default; tb->tb_dump = fn_trie_dump; - memset(tb->tb_data, 0, sizeof(struct trie)); t = (struct trie *) tb->tb_data; - - trie_init(t); - - if (id == RT_TABLE_LOCAL) - trie_local = t; - else if (id == RT_TABLE_MAIN) - trie_main = t; + memset(t, 0, sizeof(*t)); if (id == RT_TABLE_LOCAL) - printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION); + pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION); return tb; } @@ -2016,6 +1997,8 @@ struct fib_table * __init fib_hash_init(u32 id) #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { + struct seq_net_private p; + struct trie *trie_local, *trie_main; struct tnode *tnode; struct trie *trie; unsigned index; @@ -2036,7 +2019,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter) iter->tnode, iter->index, iter->depth); rescan: while (cindex < (1<<tn->bits)) { - struct node *n = tnode_get_child(tn, cindex); + struct node *n = tnode_get_child_rcu(tn, cindex); if (n) { if (IS_LEAF(n)) { @@ -2055,7 +2038,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = node_parent((struct node *)tn); + p = node_parent_rcu((struct node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2108,10 +2091,17 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { + struct leaf *l = (struct leaf *)n; + struct leaf_info *li; + struct hlist_node *tmp; + s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; + + hlist_for_each_entry_rcu(li, tmp, &l->list, hlist) + ++s->prefixes; } else { const struct tnode *tn = (const struct tnode *) n; int i; @@ -2140,13 +2130,17 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) else avdepth = 0; - seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); + seq_printf(seq, "\tAver depth: %u.%02d\n", + avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - bytes = sizeof(struct leaf) * stat->leaves; - seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes); + + seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); + bytes += sizeof(struct leaf_info) * stat->prefixes; + + seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += sizeof(struct tnode) * stat->tnodes; max = MAX_STAT_DEPTH; @@ -2156,60 +2150,89 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) pointers = 0; for (i = 1; i <= max; i++) if (stat->nodesizes[i] != 0) { - seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); + seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); - seq_printf(seq, "\tPointers: %d\n", pointers); + seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct node *) * pointers; - seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers); - seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024); + seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); + seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); +} #ifdef CONFIG_IP_FIB_TRIE_STATS - seq_printf(seq, "Counters:\n---------\n"); - seq_printf(seq,"gets = %d\n", t->stats.gets); - seq_printf(seq,"backtracks = %d\n", t->stats.backtrack); - seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); - seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); - seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); - seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); -#ifdef CLEAR_STATS - memset(&(t->stats), 0, sizeof(t->stats)); -#endif +static void trie_show_usage(struct seq_file *seq, + const struct trie_use_stats *stats) +{ + seq_printf(seq, "\nCounters:\n---------\n"); + seq_printf(seq, "gets = %u\n", stats->gets); + seq_printf(seq, "backtracks = %u\n", stats->backtrack); + seq_printf(seq, "semantic match passed = %u\n", + stats->semantic_match_passed); + seq_printf(seq, "semantic match miss = %u\n", + stats->semantic_match_miss); + seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); + seq_printf(seq, "skipped node resize = %u\n\n", + stats->resize_node_skipped); +} #endif /* CONFIG_IP_FIB_TRIE_STATS */ + +static void fib_trie_show(struct seq_file *seq, const char *name, + struct trie *trie) +{ + struct trie_stat stat; + + trie_collect_stats(trie, &stat); + seq_printf(seq, "%s:\n", name); + trie_show_stats(seq, &stat); +#ifdef CONFIG_IP_FIB_TRIE_STATS + trie_show_usage(seq, &trie->stats); +#endif } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { - struct trie_stat *stat; - - stat = kmalloc(sizeof(*stat), GFP_KERNEL); - if (!stat) - return -ENOMEM; + struct net *net = (struct net *)seq->private; + struct fib_table *tb; - seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", + seq_printf(seq, + "Basic info: size of leaf:" + " %Zd bytes, size of tnode: %Zd bytes.\n", sizeof(struct leaf), sizeof(struct tnode)); - if (trie_local) { - seq_printf(seq, "Local:\n"); - trie_collect_stats(trie_local, stat); - trie_show_stats(seq, stat); - } + tb = fib_get_table(net, RT_TABLE_LOCAL); + if (tb) + fib_trie_show(seq, "Local", (struct trie *) tb->tb_data); - if (trie_main) { - seq_printf(seq, "Main:\n"); - trie_collect_stats(trie_main, stat); - trie_show_stats(seq, stat); - } - kfree(stat); + tb = fib_get_table(net, RT_TABLE_MAIN); + if (tb) + fib_trie_show(seq, "Main", (struct trie *) tb->tb_data); return 0; } static int fib_triestat_seq_open(struct inode *inode, struct file *file) { - return single_open(file, fib_triestat_seq_show, NULL); + int err; + struct net *net; + + net = get_proc_net(inode); + if (net == NULL) + return -ENXIO; + err = single_open(file, fib_triestat_seq_show, net); + if (err < 0) { + put_net(net); + return err; + } + return 0; +} + +static int fib_triestat_seq_release(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); } static const struct file_operations fib_triestat_fops = { @@ -2217,7 +2240,7 @@ static const struct file_operations fib_triestat_fops = { .open = fib_triestat_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = fib_triestat_seq_release, }; static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, @@ -2226,13 +2249,13 @@ static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, loff_t idx = 0; struct node *n; - for (n = fib_trie_get_first(iter, trie_local); + for (n = fib_trie_get_first(iter, iter->trie_local); n; ++idx, n = fib_trie_get_next(iter)) { if (pos == idx) return n; } - for (n = fib_trie_get_first(iter, trie_main); + for (n = fib_trie_get_first(iter, iter->trie_main); n; ++idx, n = fib_trie_get_next(iter)) { if (pos == idx) return n; @@ -2241,11 +2264,25 @@ static struct node *fib_trie_get_idx(struct fib_trie_iter *iter, } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { + struct fib_trie_iter *iter = seq->private; + struct fib_table *tb; + + if (!iter->trie_local) { + tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL); + if (tb) + iter->trie_local = (struct trie *) tb->tb_data; + } + if (!iter->trie_main) { + tb = fib_get_table(iter->p.net, RT_TABLE_MAIN); + if (tb) + iter->trie_main = (struct trie *) tb->tb_data; + } rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; - return fib_trie_get_idx(seq->private, *pos - 1); + return fib_trie_get_idx(iter, *pos - 1); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -2263,13 +2300,14 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) return v; /* continue scan in next trie */ - if (iter->trie == trie_local) - return fib_trie_get_first(iter, trie_main); + if (iter->trie == iter->trie_local) + return fib_trie_get_first(iter, iter->trie_main); return NULL; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) { rcu_read_unlock(); } @@ -2279,10 +2317,8 @@ static void seq_indent(struct seq_file *seq, int n) while (n-- > 0) seq_puts(seq, " "); } -static inline const char *rtn_scope(enum rt_scope_t s) +static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) { - static char buf[32]; - switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; @@ -2290,7 +2326,7 @@ static inline const char *rtn_scope(enum rt_scope_t s) case RT_SCOPE_HOST: return "host"; case RT_SCOPE_NOWHERE: return "nowhere"; default: - snprintf(buf, sizeof(buf), "scope=%d", s); + snprintf(buf, len, "scope=%d", s); return buf; } } @@ -2310,13 +2346,11 @@ static const char *rtn_type_names[__RTN_MAX] = { [RTN_XRESOLVE] = "XRESOLVE", }; -static inline const char *rtn_type(unsigned t) +static inline const char *rtn_type(char *buf, size_t len, unsigned t) { - static char buf[32]; - if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; - snprintf(buf, sizeof(buf), "type %d", t); + snprintf(buf, len, "type %u", t); return buf; } @@ -2329,8 +2363,8 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) return 0; - if (!node_parent(n)) { - if (iter->trie == trie_local) + if (!node_parent_rcu(n)) { + if (iter->trie == iter->trie_local) seq_puts(seq, "<local>:\n"); else seq_puts(seq, "<main>:\n"); @@ -2347,25 +2381,29 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) } else { struct leaf *l = (struct leaf *) n; - int i; + struct leaf_info *li; + struct hlist_node *node; __be32 val = htonl(l->key); seq_indent(seq, iter->depth); seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val)); - for (i = 32; i >= 0; i--) { - struct leaf_info *li = find_leaf_info(l, i); - if (li) { - struct fib_alias *fa; - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - seq_indent(seq, iter->depth+1); - seq_printf(seq, " /%d %s %s", i, - rtn_scope(fa->fa_scope), - rtn_type(fa->fa_type)); - if (fa->fa_tos) - seq_printf(seq, "tos =%d\n", - fa->fa_tos); - seq_putc(seq, '\n'); - } + + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + char buf1[32], buf2[32]; + + seq_indent(seq, iter->depth+1); + seq_printf(seq, " /%d %s %s", li->plen, + rtn_scope(buf1, sizeof(buf1), + fa->fa_scope), + rtn_type(buf2, sizeof(buf2), + fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, "tos =%d\n", + fa->fa_tos); + seq_putc(seq, '\n'); } } } @@ -2382,8 +2420,8 @@ static const struct seq_operations fib_trie_seq_ops = { static int fib_trie_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &fib_trie_seq_ops, - sizeof(struct fib_trie_iter)); + return seq_open_net(inode, file, &fib_trie_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_trie_fops = { @@ -2391,7 +2429,7 @@ static const struct file_operations fib_trie_fops = { .open = fib_trie_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) @@ -2419,8 +2457,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; struct leaf *l = v; - int i; - char bf[128]; + struct leaf_info *li; + struct hlist_node *node; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2429,25 +2467,23 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } - if (iter->trie == trie_local) + if (iter->trie == iter->trie_local) return 0; + if (IS_TNODE(l)) return 0; - for (i=32; i>=0; i--) { - struct leaf_info *li = find_leaf_info(l, i); + hlist_for_each_entry_rcu(li, node, &l->list, hlist) { struct fib_alias *fa; __be32 mask, prefix; - if (!li) - continue; - mask = inet_make_mask(li->plen); prefix = htonl(l->key); list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); + char bf[128]; if (fa->fa_type == RTN_BROADCAST || fa->fa_type == RTN_MULTICAST) @@ -2461,7 +2497,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, mask, - (fi->fib_advmss ? fi->fib_advmss + 40 : 0), + (fi->fib_advmss ? + fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); else @@ -2486,8 +2523,8 @@ static const struct seq_operations fib_route_seq_ops = { static int fib_route_seq_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &fib_route_seq_ops, - sizeof(struct fib_trie_iter)); + return seq_open_net(inode, file, &fib_route_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_route_fops = { @@ -2495,35 +2532,36 @@ static const struct file_operations fib_route_fops = { .open = fib_route_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init fib_proc_init(void) +int __net_init fib_proc_init(struct net *net) { - if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops)) + if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops)) goto out1; - if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops)) + if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO, + &fib_triestat_fops)) goto out2; - if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops)) + if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops)) goto out3; return 0; out3: - proc_net_remove(&init_net, "fib_triestat"); + proc_net_remove(net, "fib_triestat"); out2: - proc_net_remove(&init_net, "fib_trie"); + proc_net_remove(net, "fib_trie"); out1: return -ENOMEM; } -void __init fib_proc_exit(void) +void __net_exit fib_proc_exit(struct net *net) { - proc_net_remove(&init_net, "fib_trie"); - proc_net_remove(&init_net, "fib_triestat"); - proc_net_remove(&init_net, "route"); + proc_net_remove(net, "fib_trie"); + proc_net_remove(net, "fib_triestat"); + proc_net_remove(net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 82baea026484..a7321a82df6d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -92,6 +92,7 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <net/checksum.h> +#include <net/xfrm.h> /* * Build xmit assembly blocks @@ -231,7 +232,7 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL; #define icmp_socket __get_cpu_var(__icmp_socket) -static __inline__ int icmp_xmit_lock(void) +static inline int icmp_xmit_lock(void) { local_bh_disable(); @@ -245,7 +246,7 @@ static __inline__ int icmp_xmit_lock(void) return 0; } -static void icmp_xmit_unlock(void) +static inline void icmp_xmit_unlock(void) { spin_unlock_bh(&icmp_socket->sk->sk_lock.slock); } @@ -274,18 +275,19 @@ static void icmp_xmit_unlock(void) #define XRLIM_BURST_FACTOR 6 int xrlim_allow(struct dst_entry *dst, int timeout) { - unsigned long now; + unsigned long now, token = dst->rate_tokens; int rc = 0; now = jiffies; - dst->rate_tokens += now - dst->rate_last; + token += now - dst->rate_last; dst->rate_last = now; - if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout) - dst->rate_tokens = XRLIM_BURST_FACTOR * timeout; - if (dst->rate_tokens >= timeout) { - dst->rate_tokens -= timeout; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + if (token >= timeout) { + token -= timeout; rc = 1; } + dst->rate_tokens = token; return rc; } @@ -403,7 +405,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) .tos = RT_TOS(ip_hdr(skb)->tos) } }, .proto = IPPROTO_ICMP }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(rt->u.dst.dev->nd_net, &rt, &fl)) goto out_unlock; } if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, @@ -435,9 +437,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct ipcm_cookie ipc; __be32 saddr; u8 tos; + struct net *net; if (!rt) goto out; + net = rt->u.dst.dev->nd_net; /* * Find the original header. It is expected to be valid, of course. @@ -513,7 +517,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct net_device *dev = NULL; if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(&init_net, rt->fl.iif); + dev = dev_get_by_index(net, rt->fl.iif); if (dev) { saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -563,11 +567,71 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } }; + int err; + struct rtable *rt2; + security_skb_classify_flow(skb_in, &fl); - if (ip_route_output_key(&rt, &fl)) + if (__ip_route_output_key(net, &rt, &fl)) + goto out_unlock; + + /* No need to clone since we're just using its address. */ + rt2 = rt; + + err = xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0); + switch (err) { + case 0: + if (rt != rt2) + goto route_done; + break; + case -EPERM: + rt = NULL; + break; + default: + goto out_unlock; + } + + if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) + goto out_unlock; + + if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) + err = __ip_route_output_key(net, &rt2, &fl); + else { + struct flowi fl2 = {}; + struct dst_entry *odst; + + fl2.fl4_dst = fl.fl4_src; + if (ip_route_output_key(net, &rt2, &fl2)) + goto out_unlock; + + /* Ugh! */ + odst = skb_in->dst; + err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, + RT_TOS(tos), rt2->u.dst.dev); + + dst_release(&rt2->u.dst); + rt2 = (struct rtable *)skb_in->dst; + skb_in->dst = odst; + } + + if (err) + goto out_unlock; + + err = xfrm_lookup((struct dst_entry **)&rt2, &fl, NULL, + XFRM_LOOKUP_ICMP); + if (err == -ENOENT) { + if (!rt) + goto out_unlock; + goto route_done; + } + + dst_release(&rt->u.dst); + rt = rt2; + + if (err) goto out_unlock; } +route_done: if (!icmpv4_xrlim_allow(rt, type, code)) goto ende; @@ -603,8 +667,10 @@ static void icmp_unreach(struct sk_buff *skb) struct icmphdr *icmph; int hash, protocol; struct net_protocol *ipprot; - struct sock *raw_sk; u32 info = 0; + struct net *net; + + net = skb->dst->dev->nd_net; /* * Incomplete header ? @@ -635,7 +701,7 @@ static void icmp_unreach(struct sk_buff *skb) "and DF set.\n", NIPQUAD(iph->daddr)); } else { - info = ip_rt_frag_needed(iph, + info = ip_rt_frag_needed(net, iph, ntohs(icmph->un.frag.mtu)); if (!info) goto out; @@ -673,7 +739,7 @@ static void icmp_unreach(struct sk_buff *skb) */ if (!sysctl_icmp_ignore_bogus_error_responses && - inet_addr_type(iph->daddr) == RTN_BROADCAST) { + inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " "type %u, code %u " @@ -697,21 +763,9 @@ static void icmp_unreach(struct sk_buff *skb) /* * Deliver ICMP message to raw sockets. Pretty useless feature? */ + raw_icmp_error(skb, protocol, info); - /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ hash = protocol & (MAX_INET_PROTOS - 1); - read_lock(&raw_v4_lock); - if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) { - while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, - iph->saddr, - skb->dev->ifindex)) != NULL) { - raw_err(raw_sk, skb, info); - raw_sk = sk_next(raw_sk); - iph = (struct iphdr *)skb->data; - } - } - read_unlock(&raw_v4_lock); - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[hash]); if (ipprot && ipprot->err_handler) @@ -929,6 +983,25 @@ int icmp_rcv(struct sk_buff *skb) struct icmphdr *icmph; struct rtable *rt = (struct rtable *)skb->dst; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + int nh; + + if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags & + XFRM_STATE_ICMP)) + goto drop; + + if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr))) + goto drop; + + nh = skb_network_offset(skb); + skb_set_network_header(skb, sizeof(*icmph)); + + if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + skb_set_network_header(skb, nh); + } + ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); switch (skb->ip_summed) { @@ -942,8 +1015,7 @@ int icmp_rcv(struct sk_buff *skb) goto error; } - if (!pskb_pull(skb, sizeof(struct icmphdr))) - goto error; + __skb_pull(skb, sizeof(*icmph)); icmph = icmp_hdr(skb); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7dbc282d4f9f..994648be80ab 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -130,12 +130,12 @@ */ #define IGMP_V1_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 1 || \ + (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL(FORCE_IGMP_VERSION) == 2 || \ + (IPV4_DEVCONF_ALL(in_dev->dev->nd_net, FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) @@ -301,7 +301,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) .nl_u = { .ip4_u = { .daddr = IGMPV3_ALL_MCR } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { kfree_skb(skb); return NULL; } @@ -349,17 +349,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) static int igmpv3_sendpack(struct sk_buff *skb) { - struct iphdr *pip = ip_hdr(skb); struct igmphdr *pig = igmp_hdr(skb); - const int iplen = skb->tail - skb->network_header; const int igmplen = skb->tail - skb->transport_header; - pip->tot_len = htons(iplen); - ip_send_check(pip); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, - dst_output); + return ip_local_out(skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -650,7 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, struct flowi fl = { .oif = dev->ifindex, .nl_u = { .ip4_u = { .daddr = dst } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) return -1; } if (rt->rt_src == 0) { @@ -680,13 +675,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = rt->rt_src; iph->protocol = IPPROTO_IGMP; - iph->tot_len = htons(IGMP_SIZE); ip_select_ident(iph, &rt->u.dst, NULL); ((u8*)&iph[1])[0] = IPOPT_RA; ((u8*)&iph[1])[1] = 4; ((u8*)&iph[1])[2] = 0; ((u8*)&iph[1])[3] = 0; - ip_send_check(iph); ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); ih->type=type; @@ -695,8 +688,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ih->group=group; ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); } static void igmp_gq_timer_expire(unsigned long data) @@ -1234,9 +1226,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST im->tm_running=0; - init_timer(&im->timer); - im->timer.data=(unsigned long)im; - im->timer.function=&igmp_timer_expire; + setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); im->unsolicit_count = IGMP_Unsolicited_Report_Count; im->reporter = 0; im->gsquery = 0; @@ -1338,13 +1328,11 @@ void ip_mc_init_dev(struct in_device *in_dev) in_dev->mc_tomb = NULL; #ifdef CONFIG_IP_MULTICAST in_dev->mr_gq_running = 0; - init_timer(&in_dev->mr_gq_timer); - in_dev->mr_gq_timer.data=(unsigned long) in_dev; - in_dev->mr_gq_timer.function=&igmp_gq_timer_expire; + setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire, + (unsigned long)in_dev); in_dev->mr_ifc_count = 0; - init_timer(&in_dev->mr_ifc_timer); - in_dev->mr_ifc_timer.data=(unsigned long) in_dev; - in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire; + setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, + (unsigned long)in_dev); in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; #endif @@ -1401,19 +1389,19 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) struct in_device *idev = NULL; if (imr->imr_ifindex) { - idev = inetdev_by_index(imr->imr_ifindex); + idev = inetdev_by_index(&init_net, imr->imr_ifindex); if (idev) __in_dev_put(idev); return idev; } if (imr->imr_address.s_addr) { - dev = ip_dev_find(imr->imr_address.s_addr); + dev = ip_dev_find(&init_net, imr->imr_address.s_addr); if (!dev) return NULL; dev_put(dev); } - if (!dev && !ip_route_output_key(&rt, &fl)) { + if (!dev && !ip_route_output_key(&init_net, &rt, &fl)) { dev = rt->u.dst.dev; ip_rt_put(rt); } @@ -1754,7 +1742,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) int ifindex; int count = 0; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -1867,7 +1855,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct int leavegroup = 0; int i, j, rv; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -1997,7 +1985,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) struct ip_sf_socklist *newpsl, *psl; int leavegroup = 0; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; if (msf->imsf_fmode != MCAST_INCLUDE && msf->imsf_fmode != MCAST_EXCLUDE) @@ -2080,7 +2068,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -2142,7 +2130,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (psin->sin_family != AF_INET) return -EINVAL; addr = psin->sin_addr.s_addr; - if (!MULTICAST(addr)) + if (!ipv4_is_multicast(addr)) return -EINVAL; rtnl_lock(); @@ -2192,7 +2180,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) struct ip_sf_socklist *psl; int i; - if (!MULTICAST(loc_addr)) + if (!ipv4_is_multicast(loc_addr)) return 1; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { @@ -2234,7 +2222,7 @@ void ip_mc_drop_socket(struct sock *sk) struct in_device *in_dev; inet->mc_list = iml->next; - in_dev = inetdev_by_index(iml->multi.imr_ifindex); + in_dev = inetdev_by_index(&init_net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev != NULL) { ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); @@ -2341,6 +2329,7 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(dev_base_lock) { read_lock(&dev_base_lock); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -2358,6 +2347,7 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) + __releases(dev_base_lock) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); if (likely(state->in_dev != NULL)) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8fb6ca23700a..7801cceb2d1b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -277,18 +277,11 @@ void inet_csk_init_xmit_timers(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); - init_timer(&icsk->icsk_retransmit_timer); - init_timer(&icsk->icsk_delack_timer); - init_timer(&sk->sk_timer); - - icsk->icsk_retransmit_timer.function = retransmit_handler; - icsk->icsk_delack_timer.function = delack_handler; - sk->sk_timer.function = keepalive_handler; - - icsk->icsk_retransmit_timer.data = - icsk->icsk_delack_timer.data = - sk->sk_timer.data = (unsigned long)sk; - + setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, + (unsigned long)sk); + setup_timer(&icsk->icsk_delack_timer, delack_handler, + (unsigned long)sk); + setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } @@ -340,7 +333,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .dport = ireq->rmt_port } } }; security_req_classify_flow(req, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) { + if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e468e7a7aac4..605ed2cd7972 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -935,7 +935,7 @@ out_free_table: static void __exit inet_diag_exit(void) { - sock_release(idiagnl->sk_socket); + netlink_kernel_release(idiagnl); kfree(inet_diag_table); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e15e04fc6661..724d69aed031 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -47,7 +47,7 @@ static void inet_frag_secret_rebuild(unsigned long dummy) } write_unlock(&f->lock); - mod_timer(&f->secret_timer, now + f->ctl->secret_interval); + mod_timer(&f->secret_timer, now + f->secret_interval); } void inet_frags_init(struct inet_frags *f) @@ -57,35 +57,45 @@ void inet_frags_init(struct inet_frags *f) for (i = 0; i < INETFRAGS_HASHSZ; i++) INIT_HLIST_HEAD(&f->hash[i]); - INIT_LIST_HEAD(&f->lru_list); rwlock_init(&f->lock); f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ (jiffies ^ (jiffies >> 6))); - f->nqueues = 0; - atomic_set(&f->mem, 0); - - init_timer(&f->secret_timer); - f->secret_timer.function = inet_frag_secret_rebuild; - f->secret_timer.data = (unsigned long)f; - f->secret_timer.expires = jiffies + f->ctl->secret_interval; + setup_timer(&f->secret_timer, inet_frag_secret_rebuild, + (unsigned long)f); + f->secret_timer.expires = jiffies + f->secret_interval; add_timer(&f->secret_timer); } EXPORT_SYMBOL(inet_frags_init); +void inet_frags_init_net(struct netns_frags *nf) +{ + nf->nqueues = 0; + atomic_set(&nf->mem, 0); + INIT_LIST_HEAD(&nf->lru_list); +} +EXPORT_SYMBOL(inet_frags_init_net); + void inet_frags_fini(struct inet_frags *f) { del_timer(&f->secret_timer); } EXPORT_SYMBOL(inet_frags_fini); +void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +{ + nf->low_thresh = 0; + inet_frag_evictor(nf, f); +} +EXPORT_SYMBOL(inet_frags_exit_net); + static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) { write_lock(&f->lock); hlist_del(&fq->list); list_del(&fq->lru_list); - f->nqueues--; + fq->net->nqueues--; write_unlock(&f->lock); } @@ -103,13 +113,13 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) EXPORT_SYMBOL(inet_frag_kill); -static inline void frag_kfree_skb(struct inet_frags *f, struct sk_buff *skb, - int *work) +static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, + struct sk_buff *skb, int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &f->mem); + atomic_sub(skb->truesize, &nf->mem); if (f->skb_free) f->skb_free(skb); kfree_skb(skb); @@ -119,22 +129,24 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, int *work) { struct sk_buff *fp; + struct netns_frags *nf; BUG_TRAP(q->last_in & COMPLETE); BUG_TRAP(del_timer(&q->timer) == 0); /* Release all fragment data. */ fp = q->fragments; + nf = q->net; while (fp) { struct sk_buff *xp = fp->next; - frag_kfree_skb(f, fp, work); + frag_kfree_skb(nf, f, fp, work); fp = xp; } if (work) *work -= f->qsize; - atomic_sub(f->qsize, &f->mem); + atomic_sub(f->qsize, &nf->mem); if (f->destructor) f->destructor(q); @@ -143,20 +155,20 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f, } EXPORT_SYMBOL(inet_frag_destroy); -int inet_frag_evictor(struct inet_frags *f) +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f) { struct inet_frag_queue *q; int work, evicted = 0; - work = atomic_read(&f->mem) - f->ctl->low_thresh; + work = atomic_read(&nf->mem) - nf->low_thresh; while (work > 0) { read_lock(&f->lock); - if (list_empty(&f->lru_list)) { + if (list_empty(&nf->lru_list)) { read_unlock(&f->lock); break; } - q = list_first_entry(&f->lru_list, + q = list_first_entry(&nf->lru_list, struct inet_frag_queue, lru_list); atomic_inc(&q->refcnt); read_unlock(&f->lock); @@ -175,8 +187,9 @@ int inet_frag_evictor(struct inet_frags *f) } EXPORT_SYMBOL(inet_frag_evictor); -static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in, - struct inet_frags *f, unsigned int hash, void *arg) +static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, + struct inet_frag_queue *qp_in, struct inet_frags *f, + unsigned int hash, void *arg) { struct inet_frag_queue *qp; #ifdef CONFIG_SMP @@ -190,7 +203,7 @@ static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in, * promoted read lock to write lock. */ hlist_for_each_entry(qp, n, &f->hash[hash], list) { - if (f->match(qp, arg)) { + if (qp->net == nf && f->match(qp, arg)) { atomic_inc(&qp->refcnt); write_unlock(&f->lock); qp_in->last_in |= COMPLETE; @@ -200,18 +213,19 @@ static struct inet_frag_queue *inet_frag_intern(struct inet_frag_queue *qp_in, } #endif qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + f->ctl->timeout)) + if (!mod_timer(&qp->timer, jiffies + nf->timeout)) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); hlist_add_head(&qp->list, &f->hash[hash]); - list_add_tail(&qp->lru_list, &f->lru_list); - f->nqueues++; + list_add_tail(&qp->lru_list, &nf->lru_list); + nf->nqueues++; write_unlock(&f->lock); return qp; } -static struct inet_frag_queue *inet_frag_alloc(struct inet_frags *f, void *arg) +static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, + struct inet_frags *f, void *arg) { struct inet_frag_queue *q; @@ -220,35 +234,36 @@ static struct inet_frag_queue *inet_frag_alloc(struct inet_frags *f, void *arg) return NULL; f->constructor(q, arg); - atomic_add(f->qsize, &f->mem); + atomic_add(f->qsize, &nf->mem); setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); + q->net = nf; return q; } -static struct inet_frag_queue *inet_frag_create(struct inet_frags *f, - void *arg, unsigned int hash) +static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, + struct inet_frags *f, void *arg, unsigned int hash) { struct inet_frag_queue *q; - q = inet_frag_alloc(f, arg); + q = inet_frag_alloc(nf, f, arg); if (q == NULL) return NULL; - return inet_frag_intern(q, f, hash, arg); + return inet_frag_intern(nf, q, f, hash, arg); } -struct inet_frag_queue *inet_frag_find(struct inet_frags *f, void *key, - unsigned int hash) +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, + struct inet_frags *f, void *key, unsigned int hash) { struct inet_frag_queue *q; struct hlist_node *n; read_lock(&f->lock); hlist_for_each_entry(q, n, &f->hash[hash], list) { - if (f->match(q, key)) { + if (q->net == nf && f->match(q, key)) { atomic_inc(&q->refcnt); read_unlock(&f->lock); return q; @@ -256,6 +271,6 @@ struct inet_frag_queue *inet_frag_find(struct inet_frags *f, void *key, } read_unlock(&f->lock); - return inet_frag_create(f, key, hash); + return inet_frag_create(nf, f, key, hash); } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 67704da04fc4..619c63c6948a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -96,6 +96,7 @@ EXPORT_SYMBOL(inet_put_port); * exclusive lock release). It should be ifdefed really. */ void inet_listen_wlock(struct inet_hashinfo *hashinfo) + __acquires(hashinfo->lhash_lock) { write_lock(&hashinfo->lhash_lock); @@ -190,6 +191,44 @@ sherry_cache: } EXPORT_SYMBOL_GPL(__inet_lookup_listener); +struct sock * __inet_lookup_established(struct inet_hashinfo *hashinfo, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif) +{ + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + struct sock *sk; + const struct hlist_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); + + prefetch(head->chain.first); + read_lock(lock); + sk_for_each(sk, node, &head->chain) { + if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &head->twchain) { + if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) + goto hit; + } + sk = NULL; +out: + read_unlock(lock); + return sk; +hit: + sock_hold(sk); + goto out; +} +EXPORT_SYMBOL_GPL(__inet_lookup_established); + /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, @@ -239,7 +278,7 @@ unique: sk->sk_hash = hash; BUG_TRAP(sk_unhashed(sk)); __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, 1); write_unlock(lock); if (twp) { @@ -267,6 +306,48 @@ static inline u32 inet_sk_port_offset(const struct sock *sk) inet->dport); } +void __inet_hash_nolisten(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + struct inet_ehash_bucket *head; + + BUG_TRAP(sk_unhashed(sk)); + + sk->sk_hash = inet_sk_ehashfn(sk); + head = inet_ehash_bucket(hashinfo, sk->sk_hash); + list = &head->chain; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + + write_lock(lock); + __sk_add_node(sk, list); + sock_prot_inuse_add(sk->sk_prot, 1); + write_unlock(lock); +} +EXPORT_SYMBOL_GPL(__inet_hash_nolisten); + +void __inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + + if (sk->sk_state != TCP_LISTEN) { + __inet_hash_nolisten(hashinfo, sk); + return; + } + + BUG_TRAP(sk_unhashed(sk)); + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + + inet_listen_wlock(hashinfo); + __sk_add_node(sk, list); + sock_prot_inuse_add(sk->sk_prot, 1); + write_unlock(lock); + wake_up(&hashinfo->lhash_wait); +} +EXPORT_SYMBOL_GPL(__inet_hash); + /* * Bind a port for a connect operation and hash it. */ @@ -334,7 +415,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __inet_hash(hinfo, sk, 0); + __inet_hash_nolisten(hinfo, sk); } spin_unlock(&head->lock); @@ -351,7 +432,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash(hinfo, sk, 0); + __inet_hash_nolisten(hinfo, sk); spin_unlock_bh(&head->lock); return 0; } else { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a60b99e0ebdc..876169f3a528 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -48,6 +48,21 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, inet_twsk_put(tw); } +void inet_twsk_put(struct inet_timewait_sock *tw) +{ + if (atomic_dec_and_test(&tw->tw_refcnt)) { + struct module *owner = tw->tw_prot->owner; + twsk_destructor((struct sock *)tw); +#ifdef SOCK_REFCNT_DEBUG + printk(KERN_DEBUG "%s timewait_sock %p released\n", + tw->tw_prot->name, tw); +#endif + kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); + module_put(owner); + } +} +EXPORT_SYMBOL_GPL(inet_twsk_put); + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -76,7 +91,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* Step 2: Remove SK from established hash. */ if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, -1); /* Step 3: Hash TW into TIMEWAIT chain. */ inet_twsk_add_node(tw, &ehead->twchain); @@ -194,16 +209,14 @@ out: EXPORT_SYMBOL_GPL(inet_twdr_hangman); -extern void twkill_slots_invalid(void); - void inet_twdr_twkill_work(struct work_struct *work) { struct inet_timewait_death_row *twdr = container_of(work, struct inet_timewait_death_row, twkill_work); int i; - if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) - twkill_slots_invalid(); + BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) > + (sizeof(twdr->thread_slots) * 8)); while (twdr->thread_slots) { spin_lock_bh(&twdr->death_lock); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 877da3ed52e2..0b3b328d82db 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -110,7 +110,7 @@ int ip_forward(struct sk_buff *skb) skb->priority = rt_tos2priority(iph->tos); - return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev, + return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, ip_forward_finish); sr_failed: diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2143bf30597a..a2e92f9709db 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -50,7 +50,7 @@ * as well. Or notify me, at least. --ANK */ -int sysctl_ipfrag_max_dist __read_mostly = 64; +static int sysctl_ipfrag_max_dist __read_mostly = 64; struct ipfrag_skb_cb { @@ -74,35 +74,16 @@ struct ipq { struct inet_peer *peer; }; -struct inet_frags_ctl ip4_frags_ctl __read_mostly = { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. - */ - .high_thresh = 256 * 1024, - .low_thresh = 192 * 1024, - - /* - * Important NOTE! Fragment queue must be destroyed before MSL expires. - * RFC791 is wrong proposing to prolongate timer each fragment arrival - * by TTL. - */ - .timeout = IP_FRAG_TIME, - .secret_interval = 10 * 60 * HZ, -}; - static struct inet_frags ip4_frags; -int ip_frag_nqueues(void) +int ip_frag_nqueues(struct net *net) { - return ip4_frags.nqueues; + return net->ipv4.frags.nqueues; } -int ip_frag_mem(void) +int ip_frag_mem(struct net *net) { - return atomic_read(&ip4_frags.mem); + return atomic_read(&net->ipv4.frags.mem); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -142,11 +123,12 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a) } /* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) +static __inline__ void frag_kfree_skb(struct netns_frags *nf, + struct sk_buff *skb, int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &ip4_frags.mem); + atomic_sub(skb->truesize, &nf->mem); kfree_skb(skb); } @@ -192,11 +174,11 @@ static void ipq_kill(struct ipq *ipq) /* Memory limiting on fragments. Evictor trashes the oldest * fragment queue until we are back under the threshold. */ -static void ip_evictor(void) +static void ip_evictor(struct net *net) { int evicted; - evicted = inet_frag_evictor(&ip4_frags); + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); if (evicted) IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); } @@ -236,7 +218,7 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static inline struct ipq *ip_find(struct iphdr *iph, u32 user) +static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -246,7 +228,7 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) arg.user = user; hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - q = inet_frag_find(&ip4_frags, &arg, hash); + q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); if (q == NULL) goto out_nomem; @@ -286,7 +268,7 @@ static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; - if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); return -ETIMEDOUT; } @@ -294,7 +276,7 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp, NULL); + frag_kfree_skb(qp->q.net, fp, NULL); fp = xp; } while (fp); @@ -431,7 +413,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); + frag_kfree_skb(qp->q.net, free_it, NULL); } } @@ -451,7 +433,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; - atomic_add(skb->truesize, &ip4_frags.mem); + atomic_add(skb->truesize, &qp->q.net->mem); if (offset == 0) qp->q.last_in |= FIRST_IN; @@ -459,7 +441,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) return ip_frag_reasm(qp, prev, dev); write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &ip4_frags.lru_list); + list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); write_unlock(&ip4_frags.lock); return -EINPROGRESS; @@ -534,12 +516,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &ip4_frags.mem); + atomic_add(clone->truesize, &qp->q.net->mem); } skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &ip4_frags.mem); + atomic_sub(head->truesize, &qp->q.net->mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -549,7 +531,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &ip4_frags.mem); + atomic_sub(fp->truesize, &qp->q.net->mem); } head->next = NULL; @@ -582,15 +564,17 @@ out_fail: int ip_defrag(struct sk_buff *skb, u32 user) { struct ipq *qp; + struct net *net; IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + net = skb->dev->nd_net; /* Start by cleaning up the memory. */ - if (atomic_read(&ip4_frags.mem) > ip4_frags_ctl.high_thresh) - ip_evictor(); + if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) + ip_evictor(net); /* Lookup (or create) queue header */ - if ((qp = ip_find(ip_hdr(skb), user)) != NULL) { + if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; spin_lock(&qp->q.lock); @@ -607,9 +591,142 @@ int ip_defrag(struct sk_buff *skb, u32 user) return -ENOMEM; } +#ifdef CONFIG_SYSCTL +static int zero; + +static struct ctl_table ip4_frags_ctl_table[] = { + { + .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, + .procname = "ipfrag_high_thresh", + .data = &init_net.ipv4.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, + .procname = "ipfrag_low_thresh", + .data = &init_net.ipv4.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_TIME, + .procname = "ipfrag_time", + .data = &init_net.ipv4.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, + .procname = "ipfrag_secret_interval", + .data = &ip4_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; + +static int ip4_frags_ctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip4_frags_ctl_table; + if (net != &init_net) { + table = kmemdup(table, sizeof(ip4_frags_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv4.frags.high_thresh; + table[1].data = &net->ipv4.frags.low_thresh; + table[2].data = &net->ipv4.frags.timeout; + table[3].mode &= ~0222; + table[4].mode &= ~0222; + } + + hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv4.frags_hdr = hdr; + return 0; + +err_reg: + if (net != &init_net) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void ip4_frags_ctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv4.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.frags_hdr); + kfree(table); +} +#else +static inline int ip4_frags_ctl_register(struct net *net) +{ + return 0; +} + +static inline void ip4_frags_ctl_unregister(struct net *net) +{ +} +#endif + +static int ipv4_frags_init_net(struct net *net) +{ + /* + * Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to + * measurably harm machine performance. + */ + net->ipv4.frags.high_thresh = 256 * 1024; + net->ipv4.frags.low_thresh = 192 * 1024; + /* + * Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival + * by TTL. + */ + net->ipv4.frags.timeout = IP_FRAG_TIME; + + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ctl_register(net); +} + +static void ipv4_frags_exit_net(struct net *net) +{ + ip4_frags_ctl_unregister(net); + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); +} + +static struct pernet_operations ip4_frags_ops = { + .init = ipv4_frags_init_net, + .exit = ipv4_frags_exit_net, +}; + void __init ipfrag_init(void) { - ip4_frags.ctl = &ip4_frags_ctl; + register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; @@ -617,6 +734,7 @@ void __init ipfrag_init(void) ip4_frags.qsize = sizeof(struct ipq); ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; + ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4b93f32de10d..63f691719353 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -176,7 +176,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(__be32 remote, __be32 local, __be3 } for (t = tunnels_l[h1]; t; t = t->next) { if (local == t->parms.iph.saddr || - (local == t->parms.iph.daddr && MULTICAST(local))) { + (local == t->parms.iph.daddr && + ipv4_is_multicast(local))) { if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) return t; } @@ -201,7 +202,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ip_tunnel_parm *parms) if (local) prio |= 1; - if (remote && !MULTICAST(remote)) { + if (remote && !ipv4_is_multicast(remote)) { prio |= 2; h ^= HASH(remote); } @@ -367,7 +368,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info) read_lock(&ipgre_lock); t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((__be32*)p) + (grehlen>>2) - 1) : 0); - if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr)) + if (t == NULL || t->parms.iph.daddr == 0 || + ipv4_is_multicast(t->parms.iph.daddr)) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) @@ -478,7 +480,7 @@ out: fl.fl4_dst = eiph->saddr; fl.fl4_tos = RT_TOS(eiph->tos); fl.proto = IPPROTO_GRE; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { kfree_skb(skb2); return; } @@ -491,7 +493,7 @@ out: fl.fl4_dst = eiph->daddr; fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&rt, &fl) || + if (ip_route_output_key(&init_net, &rt, &fl) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); kfree_skb(skb2); @@ -619,7 +621,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST - if (MULTICAST(iph->daddr)) { + if (ipv4_is_multicast(iph->daddr)) { /* Looped back packet, drop it! */ if (((struct rtable*)skb->dst)->fl.iif == 0) goto drop; @@ -746,7 +748,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .saddr = tiph->saddr, .tos = RT_TOS(tos) } }, .proto = IPPROTO_GRE }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error; } @@ -783,7 +785,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct rt6_info *rt6 = (struct rt6_info*)skb->dst; if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || + if ((tunnel->parms.iph.daddr && + !ipv4_is_multicast(tunnel->parms.iph.daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; skb->dst->metrics[RTAX_MTU-1] = mtu; @@ -896,6 +899,59 @@ tx_error: return 0; } +static void ipgre_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = ETH_DATA_LEN; + int addend = sizeof(struct iphdr) + 4; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + /* Guess output device to choose reasonable mtu and hard_header_len */ + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_GRE }; + struct rtable *rt; + if (!ip_route_output_key(&init_net, &rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + /* Precalculate GRE options length */ + if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (tunnel->parms.o_flags&GRE_CSUM) + addend += 4; + if (tunnel->parms.o_flags&GRE_KEY) + addend += 4; + if (tunnel->parms.o_flags&GRE_SEQ) + addend += 4; + } + dev->hard_header_len = hlen + addend; + dev->mtu = mtu - addend; + tunnel->hlen = addend; + +} + static int ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -956,7 +1012,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t = netdev_priv(dev); - if (MULTICAST(p.iph.daddr)) + if (ipv4_is_multicast(p.iph.daddr)) nflags = IFF_BROADCAST; else if (p.iph.daddr) nflags = IFF_POINTOPOINT; @@ -983,6 +1039,11 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t->parms.iph.ttl = p.iph.ttl; t->parms.iph.tos = p.iph.tos; t->parms.iph.frag_off = p.iph.frag_off; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipgre_tunnel_bind_dev(dev); + netdev_state_change(dev); + } } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; @@ -1085,7 +1146,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, memcpy(&iph->daddr, daddr, 4); return t->hlen; } - if (iph->daddr && !MULTICAST(iph->daddr)) + if (iph->daddr && !ipv4_is_multicast(iph->daddr)) return t->hlen; return -t->hlen; @@ -1108,7 +1169,7 @@ static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - if (MULTICAST(t->parms.iph.daddr)) { + if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi fl = { .oif = t->parms.link, .nl_u = { .ip4_u = { .daddr = t->parms.iph.daddr, @@ -1116,7 +1177,7 @@ static int ipgre_open(struct net_device *dev) .tos = RT_TOS(t->parms.iph.tos) } }, .proto = IPPROTO_GRE }; struct rtable *rt; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) return -EADDRNOTAVAIL; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1131,8 +1192,9 @@ static int ipgre_open(struct net_device *dev) static int ipgre_close(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - if (MULTICAST(t->parms.iph.daddr) && t->mlink) { - struct in_device *in_dev = inetdev_by_index(t->mlink); + if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { + struct in_device *in_dev; + in_dev = inetdev_by_index(dev->nd_net, t->mlink); if (in_dev) { ip_mc_dec_group(in_dev, t->parms.iph.daddr); in_dev_put(in_dev); @@ -1162,12 +1224,8 @@ static void ipgre_tunnel_setup(struct net_device *dev) static int ipgre_tunnel_init(struct net_device *dev) { - struct net_device *tdev = NULL; struct ip_tunnel *tunnel; struct iphdr *iph; - int hlen = LL_MAX_HEADER; - int mtu = ETH_DATA_LEN; - int addend = sizeof(struct iphdr) + 4; tunnel = netdev_priv(dev); iph = &tunnel->parms.iph; @@ -1178,25 +1236,11 @@ static int ipgre_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - /* Guess output device to choose reasonable mtu and hard_header_len */ + ipgre_tunnel_bind_dev(dev); if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_GRE }; - struct rtable *rt; - if (!ip_route_output_key(&rt, &fl)) { - tdev = rt->u.dst.dev; - ip_rt_put(rt); - } - - dev->flags |= IFF_POINTOPOINT; - #ifdef CONFIG_NET_IPGRE_BROADCAST - if (MULTICAST(iph->daddr)) { + if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; @@ -1205,31 +1249,9 @@ static int ipgre_tunnel_init(struct net_device *dev) dev->stop = ipgre_close; } #endif - } else { + } else dev->header_ops = &ipgre_header_ops; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); - - if (tdev) { - hlen = tdev->hard_header_len; - mtu = tdev->mtu; - } - dev->iflink = tunnel->parms.link; - /* Precalculate GRE options length */ - if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { - if (tunnel->parms.o_flags&GRE_CSUM) - addend += 4; - if (tunnel->parms.o_flags&GRE_KEY) - addend += 4; - if (tunnel->parms.o_flags&GRE_SEQ) - addend += 4; - } - dev->hard_header_len = hlen + addend; - dev->mtu = mtu - addend; - tunnel->hlen = addend; return 0; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 168c871fcd79..65631391d479 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -204,22 +204,14 @@ static int ip_local_deliver_finish(struct sk_buff *skb) rcu_read_lock(); { - /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ int protocol = ip_hdr(skb)->protocol; - int hash; - struct sock *raw_sk; + int hash, raw; struct net_protocol *ipprot; resubmit: - hash = protocol & (MAX_INET_PROTOS - 1); - raw_sk = sk_head(&raw_v4_htable[hash]); - - /* If there maybe a raw socket we must check - if not we - * don't care less - */ - if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) - raw_sk = NULL; + raw = raw_local_deliver(skb, protocol); + hash = protocol & (MAX_INET_PROTOS - 1); if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; @@ -237,7 +229,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb) } IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); } else { - if (!raw_sk) { + if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, @@ -268,7 +260,7 @@ int ip_local_deliver(struct sk_buff *skb) return 0; } - return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, ip_local_deliver_finish); } @@ -347,7 +339,7 @@ static int ip_rcv_finish(struct sk_buff *skb) #ifdef CONFIG_NET_CLS_ROUTE if (unlikely(skb->dst->tclassid)) { - struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); + struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); u32 idx = skb->dst->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes+=skb->len; @@ -442,7 +434,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, + return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish); inhdr_error: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 2f14745a9e1f..4d315158fd3c 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -151,7 +151,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) __be32 addr; memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(addr) != RTN_LOCAL) { + if (inet_addr_type(&init_net, addr) != RTN_LOCAL) { dopt->ts_needtime = 1; soffset += 8; } @@ -400,7 +400,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); - if (inet_addr_type(addr) == RTN_UNICAST) + if (inet_addr_type(&init_net, addr) == RTN_UNICAST) break; if (skb) timeptr = (__be32*)&optptr[optptr[2]+3]; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bc9e57550e86..18070ca65771 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -91,6 +91,28 @@ __inline__ void ip_send_check(struct iphdr *iph) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +int __ip_local_out(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->tot_len = htons(skb->len); + ip_send_check(iph); + return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev, + dst_output); +} + +int ip_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip_local_out); + /* dev_loopback_xmit for use with netfilter. */ static int ip_dev_loopback_xmit(struct sk_buff *newskb) { @@ -138,20 +160,17 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->sk_protocol; - iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, sk); if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, daddr, rt, 0); } - ip_send_check(iph); skb->priority = sk->sk_priority; /* Send it out. */ - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); @@ -251,8 +270,8 @@ int ip_mc_output(struct sk_buff *skb) ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, - newskb->dev, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, + NULL, newskb->dev, ip_dev_loopback_xmit); } @@ -267,11 +286,11 @@ int ip_mc_output(struct sk_buff *skb) if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -285,7 +304,7 @@ int ip_output(struct sk_buff *skb) skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } @@ -331,7 +350,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) * itself out. */ security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) + if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0)) goto no_route; } sk_setup_caps(sk, &rt->u.dst); @@ -347,7 +366,6 @@ packet_routed: skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); - iph->tot_len = htons(skb->len); if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) iph->frag_off = htons(IP_DF); else @@ -366,13 +384,9 @@ packet_routed: ip_select_ident_more(iph, &rt->u.dst, sk, (skb_shinfo(skb)->gso_segs ?: 1) - 1); - /* Add an IP checksum. */ - ip_send_check(iph); - skb->priority = sk->sk_priority; - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - dst_output); + return ip_local_out(skb); no_route: IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); @@ -1262,14 +1276,12 @@ int ip_push_pending_frames(struct sock *sk) ip_options_build(skb, opt, inet->cork.addr, rt, 0); } iph->tos = inet->tos; - iph->tot_len = htons(skb->len); iph->frag_off = df; ip_select_ident(iph, &rt->u.dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst; - ip_send_check(iph); skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); @@ -1279,8 +1291,7 @@ int ip_push_pending_frames(struct sock *sk) skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); + err = ip_local_out(skb); if (err) { if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; @@ -1330,8 +1341,6 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * * Should run single threaded per socket because it uses the sock * structure to pass arguments. - * - * LATER: switch from ip_build_xmit to ip_append_* */ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len) @@ -1370,7 +1379,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar .dport = tcp_hdr(skb)->source } }, .proto = sk->sk_protocol }; security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(sk->sk_net, &rt, &fl)) return; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 82817e554363..754b0a5bbfe9 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -594,7 +594,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = 0; break; } - dev = ip_dev_find(mreq.imr_address.s_addr); + dev = ip_dev_find(&init_net, mreq.imr_address.s_addr); if (dev) { mreq.imr_ifindex = dev->ifindex; dev_put(dev); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 2c44a94c2135..f4af99ad8fdb 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -182,7 +182,6 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct xfrm_state *t; - u8 mode = XFRM_MODE_TUNNEL; t = xfrm_state_alloc(); if (t == NULL) @@ -193,9 +192,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->id.daddr.a4 = x->id.daddr.a4; memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET; - if (x->props.mode == XFRM_MODE_BEET) - mode = x->props.mode; - t->props.mode = mode; + t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; @@ -389,15 +386,22 @@ static int ipcomp_init_state(struct xfrm_state *x) if (x->encap) goto out; + x->props.header_len = 0; + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct iphdr); + break; + default: + goto out; + } + err = -ENOMEM; ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) goto out; - x->props.header_len = 0; - if (x->props.mode == XFRM_MODE_TUNNEL) - x->props.header_len += sizeof(struct iphdr); - mutex_lock(&ipcomp_resource_mutex); if (!ipcomp_alloc_scratches()) goto error; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index b8f7763b2261..a52b5853aaa8 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -140,6 +140,9 @@ __be32 ic_servaddr = NONE; /* Boot server IP address */ __be32 root_server_addr = NONE; /* Address of NFS server */ u8 root_server_path[256] = { 0, }; /* Path to mount as root */ +/* vendor class identifier */ +static char vendor_class_identifier[253] __initdata; + /* Persistent data: */ static int ic_proto_used; /* Protocol used, if any */ @@ -299,7 +302,7 @@ static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) mm_segment_t oldfs = get_fs(); set_fs(get_ds()); - res = ip_rt_ioctl(cmd, (void __user *) arg); + res = ip_rt_ioctl(&init_net, cmd, (void __user *) arg); set_fs(oldfs); return res; } @@ -588,6 +591,7 @@ ic_dhcp_init_options(u8 *options) u8 mt = ((ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST); u8 *e = options; + int len; #ifdef IPCONFIG_DEBUG printk("DHCP: Sending message type %d\n", mt); @@ -628,6 +632,16 @@ ic_dhcp_init_options(u8 *options) *e++ = sizeof(ic_req_params); memcpy(e, ic_req_params, sizeof(ic_req_params)); e += sizeof(ic_req_params); + + if (*vendor_class_identifier) { + printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", + vendor_class_identifier); + *e++ = 60; /* Class-identifier */ + len = strlen(vendor_class_identifier); + *e++ = len; + memcpy(e, vendor_class_identifier, len); + e += len; + } } *e++ = 255; /* End of the list */ @@ -1513,5 +1527,16 @@ static int __init nfsaddrs_config_setup(char *addrs) return ip_auto_config_setup(addrs); } +static int __init vendor_class_identifier_setup(char *addrs) +{ + if (strlcpy(vendor_class_identifier, addrs, + sizeof(vendor_class_identifier)) + >= sizeof(vendor_class_identifier)) + printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"", + vendor_class_identifier); + return 1; +} + __setup("ip=", ip_auto_config_setup); __setup("nfsaddrs=", nfsaddrs_config_setup); +__setup("dhcpclass=", vendor_class_identifier_setup); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 8c2b2b0741da..da281581692c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -405,7 +405,7 @@ out: fl.fl4_daddr = eiph->saddr; fl.fl4_tos = RT_TOS(eiph->tos); fl.proto = IPPROTO_IPIP; - if (ip_route_output_key(&rt, &key)) { + if (ip_route_output_key(&init_net, &rt, &key)) { kfree_skb(skb2); return 0; } @@ -418,7 +418,7 @@ out: fl.fl4_daddr = eiph->daddr; fl.fl4_src = eiph->saddr; fl.fl4_tos = eiph->tos; - if (ip_route_output_key(&rt, &fl) || + if (ip_route_output_key(&init_net, &rt, &fl) || rt->u.dst.dev->type != ARPHRD_TUNNEL) { ip_rt_put(rt); kfree_skb(skb2); @@ -547,7 +547,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .saddr = tiph->saddr, .tos = RT_TOS(tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } @@ -651,6 +651,40 @@ tx_error: return 0; } +static void ipip_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + struct rtable *rt; + if (!ip_route_output_key(&init_net, &rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; +} + static int ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) { @@ -723,6 +757,11 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) t->parms.iph.ttl = p.iph.ttl; t->parms.iph.tos = p.iph.tos; t->parms.iph.frag_off = p.iph.frag_off; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipip_tunnel_bind_dev(dev); + netdev_state_change(dev); + } } if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) err = -EFAULT; @@ -791,12 +830,9 @@ static void ipip_tunnel_setup(struct net_device *dev) static int ipip_tunnel_init(struct net_device *dev) { - struct net_device *tdev = NULL; struct ip_tunnel *tunnel; - struct iphdr *iph; tunnel = netdev_priv(dev); - iph = &tunnel->parms.iph; tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -804,29 +840,7 @@ static int ipip_tunnel_init(struct net_device *dev) memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); - if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; - struct rtable *rt; - if (!ip_route_output_key(&rt, &fl)) { - tdev = rt->u.dst.dev; - ip_rt_put(rt); - } - dev->flags |= IFF_POINTOPOINT; - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(&init_net, tunnel->parms.link); - - if (tdev) { - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); - dev->mtu = tdev->mtu - sizeof(struct iphdr); - } - dev->iflink = tunnel->parms.link; + ipip_tunnel_bind_dev(dev); return 0; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 37bb497d92af..a94f52c207a7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -141,7 +141,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) p.iph.ihl = 5; p.iph.protocol = IPPROTO_IPIP; sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (void*)&p; + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; oldfs = get_fs(); set_fs(KERNEL_DS); err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); @@ -321,7 +321,7 @@ static void ipmr_destroy_unres(struct mfc_cache *c) e->error = -ETIMEDOUT; memset(&e->msg, 0, sizeof(e->msg)); - rtnl_unicast(skb, NETLINK_CB(skb).pid); + rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); } else kfree_skb(skb); } @@ -423,7 +423,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) return -ENOBUFS; break; case 0: - dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr); + dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); @@ -533,7 +533,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); } - rtnl_unicast(skb, NETLINK_CB(skb).pid); + rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid); } else ip_mr_forward(skb, c, 0); } @@ -749,7 +749,7 @@ static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) return 0; } - if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c=ipmr_cache_alloc(); @@ -849,7 +849,7 @@ static void mrtsock_destruct(struct sock *sk) { rtnl_lock(); if (sk == mroute_socket) { - IPV4_DEVCONF_ALL(MC_FORWARDING)--; + IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)--; write_lock_bh(&mrt_lock); mroute_socket=NULL; @@ -898,7 +898,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt mroute_socket=sk; write_unlock_bh(&mrt_lock); - IPV4_DEVCONF_ALL(MC_FORWARDING)++; + IPV4_DEVCONF_ALL(sk->sk_net, MC_FORWARDING)++; } rtnl_unlock(); return ret; @@ -954,10 +954,12 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt #ifdef CONFIG_IP_PIMSM case MRT_PIM: { - int v, ret; + int v; + if (get_user(v,(int __user *)optval)) return -EFAULT; - v = (v)?1:0; + v = (v) ? 1 : 0; + rtnl_lock(); ret = 0; if (v != mroute_do_pim) { @@ -1183,7 +1185,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) .saddr = vif->local, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { @@ -1192,7 +1194,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { .daddr = iph->daddr, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(&init_net, &rt, &fl)) goto out_free; } @@ -1245,7 +1247,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, + NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev, ipmr_forward_finish); return; @@ -1461,7 +1463,7 @@ int pim_rcv_v1(struct sk_buff * skb) b. packet is not a NULL-REGISTER c. packet is not truncated */ - if (!MULTICAST(encap->daddr) || + if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + sizeof(*pim) > skb->len) goto drop; @@ -1517,7 +1519,7 @@ static int pim_rcv(struct sk_buff * skb) /* check if the inner packet is destined to mcast group */ encap = (struct iphdr *)(skb_transport_header(skb) + sizeof(struct pimreghdr)); - if (!MULTICAST(encap->daddr) || + if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || ntohs(encap->tot_len) + sizeof(*pim) > skb->len) goto drop; @@ -1659,6 +1661,7 @@ static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, } static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(mrt_lock) { read_lock(&mrt_lock); return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) @@ -1682,6 +1685,7 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) + __releases(mrt_lock) { read_unlock(&mrt_lock); } @@ -1889,8 +1893,7 @@ void __init ip_mr_init(void) sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - init_timer(&ipmr_expire_timer); - ipmr_expire_timer.function=ipmr_expire_process; + setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); register_netdevice_notifier(&ip_mr_notifier); #ifdef CONFIG_PROC_FS proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops); diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 664cb8e97c1c..535abe0c45e7 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -51,18 +51,13 @@ static DEFINE_MUTEX(__ip_vs_app_mutex); */ static inline int ip_vs_app_get(struct ip_vs_app *app) { - /* test and get the module atomically */ - if (app->module) - return try_module_get(app->module); - else - return 1; + return try_module_get(app->module); } static inline void ip_vs_app_put(struct ip_vs_app *app) { - if (app->module) - module_put(app->module); + module_put(app->module); } diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 0a9f3c37e18d..65f1ba112752 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -393,7 +393,15 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) atomic_inc(&dest->refcnt); /* Bind with the destination and its corresponding transmitter */ - cp->flags |= atomic_read(&dest->conn_flags); + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + cp->flags |= atomic_read(&dest->conn_flags) & + (~IP_VS_CONN_F_INACTIVE); + else + cp->flags |= atomic_read(&dest->conn_flags); cp->dest = dest; IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " @@ -412,7 +420,11 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) /* It is a normal connection, so increase the inactive connection counter because it is in TCP SYNRECV state (inactive) or other protocol inacive state */ - atomic_inc(&dest->inactconns); + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); } else { /* It is a persistent connection/template, so increase the peristent connection counter */ @@ -629,9 +641,7 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport } INIT_LIST_HEAD(&cp->c_list); - init_timer(&cp->timer); - cp->timer.data = (unsigned long)cp; - cp->timer.function = ip_vs_conn_expire; + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); cp->protocol = proto; cp->caddr = caddr; cp->cport = cport; @@ -783,6 +793,57 @@ static const struct file_operations ip_vs_conn_fops = { .llseek = seq_lseek, .release = seq_release, }; + +static const char *ip_vs_origin_name(unsigned flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + + seq_printf(seq, + "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr), ntohs(cp->cport), + ntohl(cp->vaddr), ntohs(cp->vport), + ntohl(cp->daddr), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_sync_seq_ops); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + #endif @@ -942,6 +1003,7 @@ int ip_vs_conn_init(void) } proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); @@ -958,5 +1020,6 @@ void ip_vs_conn_cleanup(void) /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); proc_net_remove(&init_net, "ip_vs_conn"); + proc_net_remove(&init_net, "ip_vs_conn_sync"); vfree(ip_vs_conn_tab); } diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 8fba20256f52..963981a9d501 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -423,7 +423,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, and the destination is RTN_UNICAST (and not local), then create a cache_bypass connection entry */ if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(iph->daddr) == RTN_UNICAST)) { + && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) { int ret, cs; struct ip_vs_conn *cp; @@ -481,7 +481,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING * chain, and is used for VS/NAT. * It detects packets for VS/NAT connections and sends the packets * immediately. This can avoid that iptable_nat mangles the packets @@ -679,7 +679,7 @@ static inline int is_tcp_reset(const struct sk_buff *skb) } /* - * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT. + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. * Check if outgoing packet belongs to the established ip_vs_conn, * rewrite addresses of the packet and send it on its way... */ @@ -814,7 +814,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ? + if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) return NF_STOLEN; } @@ -1003,12 +1003,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, /* - * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. * When fwmark-based virtual service is used, such as transparent * cache cluster, TCP packets can be marked and routed to ip_vs_in, * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and - * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain * and send them to ip_vs_in_icmp. */ static unsigned int @@ -1025,43 +1025,42 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, } -/* After packet filtering, forward packet through VS/DR, VS/TUN, - or VS/NAT(change destination), so that filtering rules can be - applied to IPVS. */ -static struct nf_hook_ops ip_vs_in_ops = { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, - .priority = 100, -}; - -/* After packet filtering, change source only for VS/NAT */ -static struct nf_hook_ops ip_vs_out_ops = { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_FORWARD, - .priority = 100, -}; - -/* After packet filtering (but before ip_vs_out_icmp), catch icmp - destined for 0.0.0.0/0, which is for incoming IPVS connections */ -static struct nf_hook_ops ip_vs_forward_icmp_ops = { - .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_FORWARD, - .priority = 99, -}; - -/* Before the netfilter connection tracking, exit from POST_ROUTING */ -static struct nf_hook_ops ip_vs_post_routing_ops = { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, +static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, }; @@ -1092,37 +1091,15 @@ static int __init ip_vs_init(void) goto cleanup_app; } - ret = nf_register_hook(&ip_vs_in_ops); + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { - IP_VS_ERR("can't register in hook.\n"); + IP_VS_ERR("can't register hooks.\n"); goto cleanup_conn; } - ret = nf_register_hook(&ip_vs_out_ops); - if (ret < 0) { - IP_VS_ERR("can't register out hook.\n"); - goto cleanup_inops; - } - ret = nf_register_hook(&ip_vs_post_routing_ops); - if (ret < 0) { - IP_VS_ERR("can't register post_routing hook.\n"); - goto cleanup_outops; - } - ret = nf_register_hook(&ip_vs_forward_icmp_ops); - if (ret < 0) { - IP_VS_ERR("can't register forward_icmp hook.\n"); - goto cleanup_postroutingops; - } - IP_VS_INFO("ipvs loaded.\n"); return ret; - cleanup_postroutingops: - nf_unregister_hook(&ip_vs_post_routing_ops); - cleanup_outops: - nf_unregister_hook(&ip_vs_out_ops); - cleanup_inops: - nf_unregister_hook(&ip_vs_in_ops); cleanup_conn: ip_vs_conn_cleanup(); cleanup_app: @@ -1136,10 +1113,7 @@ static int __init ip_vs_init(void) static void __exit ip_vs_cleanup(void) { - nf_unregister_hook(&ip_vs_forward_icmp_ops); - nf_unregister_hook(&ip_vs_post_routing_ops); - nf_unregister_hook(&ip_vs_out_ops); - nf_unregister_hook(&ip_vs_in_ops); + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 693d92490c11..94c5767c8e01 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -704,7 +704,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; /* check if local node and update the flags */ - if (inet_addr_type(udest->addr) == RTN_LOCAL) { + if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) | IP_VS_CONN_F_LOCALNODE; } @@ -756,7 +756,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, EnterFunction(2); - atype = inet_addr_type(udest->addr); + atype = inet_addr_type(&init_net, udest->addr); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; @@ -1591,34 +1591,13 @@ static struct ctl_table vs_vars[] = { { .ctl_name = 0 } }; -static ctl_table vs_table[] = { - { - .procname = "vs", - .mode = 0555, - .child = vs_vars - }, - { .ctl_name = 0 } -}; - -static ctl_table ipvs_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = vs_table, - }, - { .ctl_name = 0 } -}; - -static ctl_table vs_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipvs_ipv4_table, - }, - { .ctl_name = 0 } +struct ctl_path net_vs_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "vs", }, + { } }; +EXPORT_SYMBOL_GPL(net_vs_ctl_path); static struct ctl_table_header * sysctl_header; @@ -2345,7 +2324,7 @@ int ip_vs_control_init(void) proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - sysctl_header = register_sysctl_table(vs_root_table); + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 7d68b80c4c19..dfa0d713c801 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/interrupt.h> +#include <linux/sysctl.h> #include <net/ip_vs.h> @@ -146,9 +147,8 @@ int ip_vs_new_estimator(struct ip_vs_stats *stats) write_lock_bh(&est_lock); est->next = est_list; if (est->next == NULL) { - init_timer(&est_timer); + setup_timer(&est_timer, estimation_timer, 0); est_timer.expires = jiffies + 2*HZ; - est_timer.function = estimation_timer; add_timer(&est_timer); } est_list = est; diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index ad89644ef5d2..3888642706ad 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -123,35 +123,6 @@ static ctl_table vs_vars_table[] = { { .ctl_name = 0 } }; -static ctl_table vs_table[] = { - { - .procname = "vs", - .mode = 0555, - .child = vs_vars_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipvs_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = vs_table - }, - { .ctl_name = 0 } -}; - -static ctl_table lblc_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipvs_ipv4_table - }, - { .ctl_name = 0 } -}; - static struct ctl_table_header * sysctl_header; /* @@ -391,9 +362,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) /* * Hook periodic timer for garbage collection */ - init_timer(&tbl->periodic_timer); - tbl->periodic_timer.data = (unsigned long)tbl; - tbl->periodic_timer.function = ip_vs_lblc_check_expire; + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)tbl); tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; add_timer(&tbl->periodic_timer); @@ -583,7 +553,7 @@ static int __init ip_vs_lblc_init(void) int ret; INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); - sysctl_header = register_sysctl_table(lblc_root_table); + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); if (ret) unregister_sysctl_table(sysctl_header); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 2a5ed85a3352..daa260eb21cf 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -311,35 +311,6 @@ static ctl_table vs_vars_table[] = { { .ctl_name = 0 } }; -static ctl_table vs_table[] = { - { - .procname = "vs", - .mode = 0555, - .child = vs_vars_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipvs_ipv4_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = vs_table - }, - { .ctl_name = 0 } -}; - -static ctl_table lblcr_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipvs_ipv4_table - }, - { .ctl_name = 0 } -}; - static struct ctl_table_header * sysctl_header; /* @@ -575,9 +546,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) /* * Hook periodic timer for garbage collection */ - init_timer(&tbl->periodic_timer); - tbl->periodic_timer.data = (unsigned long)tbl; - tbl->periodic_timer.function = ip_vs_lblcr_check_expire; + setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, + (unsigned long)tbl); tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; add_timer(&tbl->periodic_timer); @@ -772,7 +742,7 @@ static int __init ip_vs_lblcr_init(void) int ret; INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); - sysctl_header = register_sysctl_table(lblcr_root_table); + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); if (ret) unregister_sysctl_table(sysctl_header); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index c0e11ec8f0f9..dde28a250d92 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -165,7 +165,7 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); if (ih == NULL) sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->frag_off & __constant_htons(IP_OFFSET)) + else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", pp->name, NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index c36ccf057a19..aef0d3ee8e44 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -52,15 +52,15 @@ esp_conn_in_get(const struct sk_buff *skb, if (likely(!inverse)) { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_in_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { @@ -89,15 +89,15 @@ esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, if (likely(!inverse)) { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->saddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->daddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } else { cp = ip_vs_conn_out_get(IPPROTO_UDP, iph->daddr, - __constant_htons(PORT_ISAKMP), + htons(PORT_ISAKMP), iph->saddr, - __constant_htons(PORT_ISAKMP)); + htons(PORT_ISAKMP)); } if (!cp) { diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c index 432235861908..121a32b1b756 100644 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -24,6 +24,7 @@ #include <linux/interrupt.h> #include <asm/string.h> #include <linux/kmod.h> +#include <linux/sysctl.h> #include <net/ip_vs.h> diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index bd930efc18da..948378d0a755 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -305,10 +305,11 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); for (i=0; i<m->nr_conns; i++) { - unsigned flags; + unsigned flags, state; s = (struct ip_vs_sync_conn *)p; - flags = ntohs(s->flags); + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + state = ntohs(s->state); if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(s->protocol, s->caddr, s->cport, @@ -326,6 +327,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) dest = ip_vs_find_dest(s->daddr, s->dport, s->vaddr, s->vport, s->protocol); + /* Set the approprite ativity flag */ + if (s->protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } cp = ip_vs_conn_new(s->protocol, s->caddr, s->cport, s->vaddr, s->vport, @@ -337,7 +345,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) IP_VS_ERR("ip_vs_conn_new failed\n"); return; } - cp->state = ntohs(s->state); + cp->state = state; } else if (!cp->dest) { dest = ip_vs_try_bind_dest(cp); if (!dest) { @@ -346,8 +354,22 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) cp->flags = flags | IP_VS_CONN_F_HASHED; } else atomic_dec(&dest->refcnt); - } /* Note that we don't touch its state and flags - if it is a normal entry. */ + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } if (flags & IP_VS_CONN_F_SEQ_MASK) { opt = (struct ip_vs_sync_conn_options *)&s[1]; @@ -357,7 +379,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) p += SIMPLE_CONN_SIZE; atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = ntohs(s->state); + cp->state = state; pp = ip_vs_proto_get(s->protocol); cp->timeout = pp->timeout_table[cp->state]; ip_vs_conn_put(cp); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index 7c074e386c17..f63006caea03 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -16,8 +16,8 @@ */ #include <linux/kernel.h> -#include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ +#include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ @@ -59,7 +59,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) return dst; } -static inline struct rtable * +static struct rtable * __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) { struct rtable *rt; /* Route to the other host */ @@ -78,7 +78,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { spin_unlock(&dest->dst_lock); IP_VS_DBG_RL("ip_route_output error, " "dest: %u.%u.%u.%u\n", @@ -101,7 +101,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: " "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); return NULL; @@ -129,7 +129,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) do { \ (skb)->ipvs_property = 1; \ skb_forward_csum(skb); \ - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ + NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ } while (0) @@ -170,7 +170,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); goto tx_error_icmp; @@ -406,14 +406,12 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->ttl = old_iph->ttl; - iph->tot_len = htons(skb->len); ip_select_ident(iph, &rt->u.dst, NULL); - ip_send_check(iph); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(skb, rt); + ip_local_out(skb); LeaveFunction(10); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 5539debf4973..9a904c6c0dc8 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -7,6 +7,7 @@ #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) @@ -18,12 +19,12 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) unsigned int hh_len; unsigned int type; - type = inet_addr_type(iph->saddr); + type = inet_addr_type(&init_net, iph->saddr); if (addr_type == RTN_UNSPEC) addr_type = type; /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause - * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ if (addr_type == RTN_LOCAL) { fl.nl_u.ip4_u.daddr = iph->daddr; @@ -32,7 +33,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl.mark = skb->mark; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(&init_net, &rt, &fl) != 0) return -1; /* Drop old route. */ @@ -42,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(&init_net, &rt, &fl) != 0) return -1; odst = skb->dst; @@ -122,11 +123,12 @@ struct ip_rt_info { u_int8_t tos; }; -static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info) +static void nf_ip_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) { - struct ip_rt_info *rt_info = nf_info_reroute(info); + struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (info->hook == NF_IP_LOCAL_OUT) { + if (entry->hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); rt_info->tos = iph->tos; @@ -135,11 +137,12 @@ static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_info *info) } } -static int nf_ip_reroute(struct sk_buff *skb, const struct nf_info *info) +static int nf_ip_reroute(struct sk_buff *skb, + const struct nf_queue_entry *entry) { - const struct ip_rt_info *rt_info = nf_info_reroute(info); + const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); - if (info->hook == NF_IP_LOCAL_OUT) { + if (entry->hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); if (!(iph->tos == rt_info->tos @@ -158,7 +161,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, switch (skb->ip_summed) { case CHECKSUM_COMPLETE: - if (hook != NF_IP_PRE_ROUTING && hook != NF_IP_LOCAL_IN) + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if ((protocol == 0 && !csum_fold(skb->csum)) || !csum_tcpudp_magic(iph->saddr, iph->daddr, @@ -182,9 +185,15 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, EXPORT_SYMBOL(nf_ip_checksum); -static struct nf_afinfo nf_ip_afinfo = { +static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) +{ + return ip_route_output_key(&init_net, (struct rtable **)dst, fl); +} + +static const struct nf_afinfo nf_ip_afinfo = { .family = AF_INET, .checksum = nf_ip_checksum, + .route = nf_ip_route, .saveroute = nf_ip_saveroute, .reroute = nf_ip_reroute, .route_key_size = sizeof(struct ip_rt_info), @@ -202,3 +211,13 @@ static void ipv4_netfilter_fini(void) module_init(ipv4_netfilter_init); module_exit(ipv4_netfilter_fini); + +#ifdef CONFIG_SYSCTL +struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, }, + { } +}; +EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); +#endif /* CONFIG_SYSCTL */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9aca9c55687c..9a077cb24798 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -8,6 +8,7 @@ menu "IP: Netfilter Configuration" config NF_CONNTRACK_IPV4 tristate "IPv4 connection tracking support (required for NAT)" depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n ---help--- Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related @@ -32,6 +33,7 @@ config NF_CONNTRACK_PROC_COMPAT config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" + depends on NETFILTER_ADVANCED help Netfilter has the ability to queue packets to user space: the netlink device can be used to access them using this driver. @@ -44,6 +46,7 @@ config IP_NF_QUEUE config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" + default m if NETFILTER_ADVANCED=n select NETFILTER_XTABLES help iptables is a general, extensible packet identification framework. @@ -54,27 +57,10 @@ config IP_NF_IPTABLES To compile it as a module, choose M here. If unsure, say N. # The matches. -config IP_NF_MATCH_IPRANGE - tristate "IP range match support" - depends on IP_NF_IPTABLES - help - This option makes possible to match IP addresses against IP address - ranges. - - To compile it as a module, choose M here. If unsure, say N. - -config IP_NF_MATCH_TOS - tristate "TOS match support" - depends on IP_NF_IPTABLES - help - TOS matching allows you to match packets based on the Type Of - Service fields of the IP packet. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_RECENT - tristate "recent match support" + tristate '"recent" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This match is used for creating one or many lists of recently used addresses and then matching against that/those list(s). @@ -85,8 +71,9 @@ config IP_NF_MATCH_RECENT To compile it as a module, choose M here. If unsure, say N. config IP_NF_MATCH_ECN - tristate "ECN match support" + tristate '"ecn" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This option adds a `ECN' match, which allows you to match against the IPv4 and TCP header ECN fields. @@ -94,8 +81,9 @@ config IP_NF_MATCH_ECN To compile it as a module, choose M here. If unsure, say N. config IP_NF_MATCH_AH - tristate "AH match support" + tristate '"ah" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This match extension allows you to match a range of SPIs inside AH header of IPSec packets. @@ -103,30 +91,23 @@ config IP_NF_MATCH_AH To compile it as a module, choose M here. If unsure, say N. config IP_NF_MATCH_TTL - tristate "TTL match support" + tristate '"ttl" match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user to match packets by their TTL value. To compile it as a module, choose M here. If unsure, say N. -config IP_NF_MATCH_OWNER - tristate "Owner match support" - depends on IP_NF_IPTABLES - help - Packet owner matching allows you to match locally-generated packets - based on who created them: the user, group, process or session. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_MATCH_ADDRTYPE - tristate 'address type match support' + tristate '"addrtype" address type match support' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This option allows you to match what routing thinks of an address, eg. UNICAST, LOCAL, BROADCAST, ... - + If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. @@ -134,6 +115,7 @@ config IP_NF_MATCH_ADDRTYPE config IP_NF_FILTER tristate "Packet filtering" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -144,6 +126,7 @@ config IP_NF_FILTER config IP_NF_TARGET_REJECT tristate "REJECT target support" depends on IP_NF_FILTER + default m if NETFILTER_ADVANCED=n help The REJECT target allows a filtering rule to specify that an ICMP error should be issued in response to an incoming packet, rather @@ -154,6 +137,7 @@ config IP_NF_TARGET_REJECT config IP_NF_TARGET_LOG tristate "LOG target support" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help This option adds a `LOG' target, which allows you to create rules in any iptables table which records the packet header to the syslog. @@ -163,6 +147,7 @@ config IP_NF_TARGET_LOG config IP_NF_TARGET_ULOG tristate "ULOG target support" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n ---help--- This option enables the old IPv4-only "ipt_ULOG" implementation @@ -183,6 +168,7 @@ config IP_NF_TARGET_ULOG config NF_NAT tristate "Full NAT" depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4 + default m if NETFILTER_ADVANCED=n help The Full NAT option allows masquerading, port forwarding and other forms of full Network Address Port Translation. It is controlled by @@ -198,6 +184,7 @@ config NF_NAT_NEEDED config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" depends on NF_NAT + default m if NETFILTER_ADVANCED=n help Masquerading is a special case of NAT: all outgoing connections are changed to seem to come from a particular interface's address, and @@ -210,6 +197,7 @@ config IP_NF_TARGET_MASQUERADE config IP_NF_TARGET_REDIRECT tristate "REDIRECT target support" depends on NF_NAT + depends on NETFILTER_ADVANCED help REDIRECT is a special case of NAT: all incoming connections are mapped onto the incoming interface's address, causing the packets to @@ -221,6 +209,7 @@ config IP_NF_TARGET_REDIRECT config IP_NF_TARGET_NETMAP tristate "NETMAP target support" depends on NF_NAT + depends on NETFILTER_ADVANCED help NETMAP is an implementation of static 1:1 NAT mapping of network addresses. It maps the network address part, while keeping the host @@ -229,18 +218,10 @@ config IP_NF_TARGET_NETMAP To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_SAME - tristate "SAME target support (OBSOLETE)" - depends on NF_NAT - help - This option adds a `SAME' target, which works like the standard SNAT - target, but attempts to give clients the same IP for all connections. - - To compile it as a module, choose M here. If unsure, say N. - config NF_NAT_SNMP_BASIC - tristate "Basic SNMP-ALG support (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_NAT + tristate "Basic SNMP-ALG support" + depends on NF_NAT + depends on NETFILTER_ADVANCED ---help--- This module implements an Application Layer Gateway (ALG) for @@ -304,6 +285,7 @@ config NF_NAT_SIP config IP_NF_MANGLE tristate "Packet mangling" depends on IP_NF_IPTABLES + default m if NETFILTER_ADVANCED=n help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -311,19 +293,10 @@ config IP_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -config IP_NF_TARGET_TOS - tristate "TOS target support" - depends on IP_NF_MANGLE - help - This option adds a `TOS' target, which allows you to create rules in - the `mangle' table which alter the Type Of Service field of an IP - packet prior to routing. - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_TARGET_ECN tristate "ECN target support" depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED ---help--- This option adds a `ECN' target, which can be used in the iptables mangle table. @@ -338,6 +311,7 @@ config IP_NF_TARGET_ECN config IP_NF_TARGET_TTL tristate 'TTL target support' depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED help This option adds a `TTL' target, which enables the user to modify the TTL value of the IP header. @@ -353,6 +327,7 @@ config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" depends on IP_NF_MANGLE && EXPERIMENTAL depends on NF_CONNTRACK_IPV4 + depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK help The CLUSTERIP target allows you to build load-balancing clusters of @@ -365,6 +340,7 @@ config IP_NF_TARGET_CLUSTERIP config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' depends on IP_NF_IPTABLES + depends on NETFILTER_ADVANCED help This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -377,6 +353,7 @@ config IP_NF_RAW config IP_NF_ARPTABLES tristate "ARP tables support" select NETFILTER_XTABLES + depends on NETFILTER_ADVANCED help arptables is a general, extensible packet identification framework. The ARP packet filtering and mangling (manipulation)subsystems diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7456833d6ade..0c7dc78a62e9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -44,10 +44,7 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o -obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o -obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o -obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o # targets @@ -58,8 +55,6 @@ obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o -obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o -obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 2909c92ecd99..b4a810c28ac8 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -19,9 +19,10 @@ #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/init.h> - -#include <asm/uaccess.h> #include <linux/mutex.h> +#include <linux/err.h> +#include <net/compat.h> +#include <asm/uaccess.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> @@ -83,7 +84,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, __be32 src_ipaddr, tgt_ipaddr; int i, ret; -#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg)) +#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, ARPT_INV_ARPOP)) { @@ -179,6 +180,7 @@ static inline int arp_packet_match(const struct arphdr *arphdr, } return 1; +#undef FWINV } static inline int arp_checkentry(const struct arpt_arp *arp) @@ -435,29 +437,9 @@ static int mark_source_chains(struct xt_table_info *newinfo, return 1; } -static inline int standard_check(const struct arpt_entry_target *t, - unsigned int max_offset) -{ - /* Check standard info. */ - if (t->u.target_size - != ARPT_ALIGN(sizeof(struct arpt_standard_target))) { - duprintf("arpt_standard_check: target size %u != %Zu\n", - t->u.target_size, - ARPT_ALIGN(sizeof(struct arpt_standard_target))); - return 0; - } - - return 1; -} - -static struct arpt_target arpt_standard_target; - -static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size, - unsigned int *i) +static inline int check_entry(struct arpt_entry *e, const char *name) { struct arpt_entry_target *t; - struct arpt_target *target; - int ret; if (!arp_checkentry(&e->arp)) { duprintf("arp_tables: arp check failed %p %s.\n", e, name); @@ -471,35 +453,57 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; + return 0; +} + +static inline int check_target(struct arpt_entry *e, const char *name) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + t = arpt_get_target(e); + target = t->u.kernel.target; + + ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), + name, e->comefrom, 0, 0); + if (!ret && t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, target, t->data, + e->comefrom)) { + duprintf("arp_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + } + return ret; +} + +static inline int +find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + ret = check_entry(e, name); + if (ret) + return ret; + + t = arpt_get_target(e); target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, t->u.user.revision), "arpt_%s", t->u.user.name); if (IS_ERR(target) || !target) { - duprintf("check_entry: `%s' not found\n", t->u.user.name); + duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = target ? PTR_ERR(target) : -ENOENT; goto out; } t->u.kernel.target = target; - ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), - name, e->comefrom, 0, 0); + ret = check_target(e, name); if (ret) goto err; - if (t->u.kernel.target == &arpt_standard_target) { - if (!standard_check(t, size)) { - ret = -EINVAL; - goto err; - } - } else if (t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, t->data, - e->comefrom)) { - duprintf("arp_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - ret = -EINVAL; - goto err; - } - (*i)++; return 0; err: @@ -633,7 +637,7 @@ static int translate_table(const char *name, /* Finally, each sanity check must pass */ i = 0; ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, - check_entry, name, size, &i); + find_check_entry, name, size, &i); if (ret != 0) { ARPT_ENTRY_ITERATE(entry0, newinfo->size, @@ -704,16 +708,11 @@ static void get_counters(const struct xt_table_info *t, } } -static int copy_entries_to_user(unsigned int total_size, - struct arpt_table *table, - void __user *userptr) +static inline struct xt_counters *alloc_counters(struct arpt_table *table) { - unsigned int off, num, countersize; - struct arpt_entry *e; + unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - int ret = 0; - void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -723,13 +722,31 @@ static int copy_entries_to_user(unsigned int total_size, counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* First, sum counters... */ write_lock_bh(&table->lock); get_counters(private, counters); write_unlock_bh(&table->lock); + return counters; +} + +static int copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void __user *userptr) +{ + unsigned int off, num; + struct arpt_entry *e; + struct xt_counters *counters; + struct xt_table_info *private = table->private; + int ret = 0; + void *loc_cpu_entry; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + loc_cpu_entry = private->entries[raw_smp_processor_id()]; /* ... then copy entire thing ... */ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { @@ -767,23 +784,159 @@ static int copy_entries_to_user(unsigned int total_size, return ret; } -static int get_entries(const struct arpt_get_entries *entries, - struct arpt_get_entries __user *uptr) +#ifdef CONFIG_COMPAT +static void compat_standard_from_user(void *dst, void *src) +{ + int v = *(compat_int_t *)src; + + if (v > 0) + v += xt_compat_calc_jump(NF_ARP, v); + memcpy(dst, &v, sizeof(v)); +} + +static int compat_standard_to_user(void __user *dst, void *src) { + compat_int_t cv = *(int *)src; + + if (cv > 0) + cv -= xt_compat_calc_jump(NF_ARP, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; +} + +static int compat_calc_entry(struct arpt_entry *e, + const struct xt_table_info *info, + void *base, struct xt_table_info *newinfo) +{ + struct arpt_entry_target *t; + unsigned int entry_offset; + int off, i, ret; + + off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + entry_offset = (void *)e - base; + + t = arpt_get_target(e); + off += xt_compat_target_offset(t->u.kernel.target); + newinfo->size -= off; + ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct arpt_entry *)(base + info->hook_entry[i]))) + newinfo->hook_entry[i] -= off; + if (info->underflow[i] && + (e < (struct arpt_entry *)(base + info->underflow[i]))) + newinfo->underflow[i] -= off; + } + return 0; +} + +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) +{ + void *loc_cpu_entry; + + if (!newinfo || !info) + return -EINVAL; + + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; + loc_cpu_entry = info->entries[raw_smp_processor_id()]; + return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size, + compat_calc_entry, info, loc_cpu_entry, + newinfo); +} +#endif + +static int get_info(void __user *user, int *len, int compat) +{ + char name[ARPT_TABLE_MAXNAMELEN]; + struct arpt_table *t; int ret; + + if (*len != sizeof(struct arpt_getinfo)) { + duprintf("length %u != %Zu\n", *len, + sizeof(struct arpt_getinfo)); + return -EINVAL; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) + return -EFAULT; + + name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_lock(NF_ARP); +#endif + t = try_then_request_module(xt_find_table_lock(NF_ARP, name), + "arptable_%s", name); + if (t && !IS_ERR(t)) { + struct arpt_getinfo info; + struct xt_table_info *private = t->private; + +#ifdef CONFIG_COMPAT + if (compat) { + struct xt_table_info tmp; + ret = compat_table_info(private, &tmp); + xt_compat_flush_offsets(NF_ARP); + private = &tmp; + } +#endif + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, private->underflow, + sizeof(info.underflow)); + info.num_entries = private->number; + info.size = private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + xt_table_unlock(t); + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_unlock(NF_ARP); +#endif + return ret; +} + +static int get_entries(struct arpt_get_entries __user *uptr, int *len) +{ + int ret; + struct arpt_get_entries get; struct arpt_table *t; - t = xt_find_table_lock(NF_ARP, entries->name); + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct arpt_get_entries) + get.size) { + duprintf("get_entries: %u != %Zu\n", *len, + sizeof(struct arpt_get_entries) + get.size); + return -EINVAL; + } + + t = xt_find_table_lock(NF_ARP, get.name); if (t && !IS_ERR(t)) { struct xt_table_info *private = t->private; duprintf("t->private->number = %u\n", private->number); - if (entries->size == private->size) + if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - private->size, entries->size); + private->size, get.size); ret = -EINVAL; } module_put(t->me); @@ -794,71 +947,41 @@ static int get_entries(const struct arpt_get_entries *entries, return ret; } -static int do_replace(void __user *user, unsigned int len) +static int __do_replace(const char *name, unsigned int valid_hooks, + struct xt_table_info *newinfo, + unsigned int num_counters, + void __user *counters_ptr) { int ret; - struct arpt_replace tmp; struct arpt_table *t; - struct xt_table_info *newinfo, *oldinfo; + struct xt_table_info *oldinfo; struct xt_counters *counters; - void *loc_cpu_entry, *loc_cpu_old_entry; - - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) - return -EFAULT; + void *loc_cpu_old_entry; - /* Hack: Causes ipchains to give correct error msg --RR */ - if (len != sizeof(tmp) + tmp.size) - return -ENOPROTOOPT; - - /* overflow check */ - if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - - SMP_CACHE_BYTES) - return -ENOMEM; - if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) - return -ENOMEM; - - newinfo = xt_alloc_table_info(tmp.size); - if (!newinfo) - return -ENOMEM; - - /* choose the copy that is on our node/cpu */ - loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; - if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), - tmp.size) != 0) { - ret = -EFAULT; - goto free_newinfo; - } - - counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); + ret = 0; + counters = vmalloc_node(num_counters * sizeof(struct xt_counters), + numa_node_id()); if (!counters) { ret = -ENOMEM; - goto free_newinfo; + goto out; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); - if (ret != 0) - goto free_newinfo_counters; - - duprintf("arp_tables: Translated table\n"); - - t = try_then_request_module(xt_find_table_lock(NF_ARP, tmp.name), - "arptable_%s", tmp.name); + t = try_then_request_module(xt_find_table_lock(NF_ARP, name), + "arptable_%s", name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free_newinfo_counters_untrans; } /* You lied! */ - if (tmp.valid_hooks != t->valid_hooks) { + if (valid_hooks != t->valid_hooks) { duprintf("Valid hook crap: %08X vs %08X\n", - tmp.valid_hooks, t->valid_hooks); + valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } - oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); + oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; @@ -876,11 +999,12 @@ static int do_replace(void __user *user, unsigned int len) get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, + NULL); xt_free_table_info(oldinfo); - if (copy_to_user(tmp.counters, counters, - sizeof(struct xt_counters) * tmp.num_counters) != 0) + if (copy_to_user(counters_ptr, counters, + sizeof(struct xt_counters) * num_counters) != 0) ret = -EFAULT; vfree(counters); xt_table_unlock(t); @@ -890,9 +1014,53 @@ static int do_replace(void __user *user, unsigned int len) module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: - ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); - free_newinfo_counters: vfree(counters); + out: + return ret; +} + +static int do_replace(void __user *user, unsigned int len) +{ + int ret; + struct arpt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("arp_tables: Translated table\n"); + + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -912,31 +1080,59 @@ static inline int add_counter_to_entry(struct arpt_entry *e, return 0; } -static int do_add_counters(void __user *user, unsigned int len) +static int do_add_counters(void __user *user, unsigned int len, int compat) { unsigned int i; - struct xt_counters_info tmp, *paddc; + struct xt_counters_info tmp; + struct xt_counters *paddc; + unsigned int num_counters; + char *name; + int size; + void *ptmp; struct arpt_table *t; struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; +#ifdef CONFIG_COMPAT + struct compat_xt_counters_info compat_tmp; - if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + if (compat) { + ptmp = &compat_tmp; + size = sizeof(struct compat_xt_counters_info); + } else +#endif + { + ptmp = &tmp; + size = sizeof(struct xt_counters_info); + } + + if (copy_from_user(ptmp, user, size) != 0) return -EFAULT; - if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) +#ifdef CONFIG_COMPAT + if (compat) { + num_counters = compat_tmp.num_counters; + name = compat_tmp.name; + } else +#endif + { + num_counters = tmp.num_counters; + name = tmp.name; + } + + if (len != size + num_counters * sizeof(struct xt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len - size, numa_node_id()); if (!paddc) return -ENOMEM; - if (copy_from_user(paddc, user, len) != 0) { + if (copy_from_user(paddc, user + size, len - size) != 0) { ret = -EFAULT; goto free; } - t = xt_find_table_lock(NF_ARP, tmp.name); + t = xt_find_table_lock(NF_ARP, name); if (!t || IS_ERR(t)) { ret = t ? PTR_ERR(t) : -ENOENT; goto free; @@ -944,7 +1140,7 @@ static int do_add_counters(void __user *user, unsigned int len) write_lock_bh(&t->lock); private = t->private; - if (private->number != tmp.num_counters) { + if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } @@ -955,7 +1151,7 @@ static int do_add_counters(void __user *user, unsigned int len) ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, - paddc->counters, + paddc, &i); unlock_up_free: write_unlock_bh(&t->lock); @@ -967,7 +1163,329 @@ static int do_add_counters(void __user *user, unsigned int len) return ret; } -static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +#ifdef CONFIG_COMPAT +static inline int +compat_release_entry(struct compat_arpt_entry *e, unsigned int *i) +{ + struct arpt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + t = compat_arpt_get_target(e); + module_put(t->u.kernel.target->me); + return 0; +} + +static inline int +check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + unsigned char *base, + unsigned char *limit, + unsigned int *hook_entries, + unsigned int *underflows, + unsigned int *i, + const char *name) +{ + struct arpt_entry_target *t; + struct xt_target *target; + unsigned int entry_offset; + int ret, off, h; + + duprintf("check_compat_entry_size_and_hooks %p\n", e); + if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 + || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + duprintf("Bad offset %p, limit = %p\n", e, limit); + return -EINVAL; + } + + if (e->next_offset < sizeof(struct compat_arpt_entry) + + sizeof(struct compat_xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct arpt_entry *)e, name); + if (ret) + return ret; + + off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + entry_offset = (void *)e - (void *)base; + + t = compat_arpt_get_target(e); + target = try_then_request_module(xt_find_target(NF_ARP, + t->u.user.name, + t->u.user.revision), + "arpt_%s", t->u.user.name); + if (IS_ERR(target) || !target) { + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); + ret = target ? PTR_ERR(target) : -ENOENT; + goto out; + } + t->u.kernel.target = target; + + off += xt_compat_target_offset(target); + *size += off; + ret = xt_compat_add_offset(NF_ARP, entry_offset, off); + if (ret) + goto release_target; + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* Clear counters and comefrom */ + memset(&e->counters, 0, sizeof(e->counters)); + e->comefrom = 0; + + (*i)++; + return 0; + +release_target: + module_put(t->u.kernel.target->me); +out: + return ret; +} + +static int +compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) +{ + struct arpt_entry_target *t; + struct xt_target *target; + struct arpt_entry *de; + unsigned int origsize; + int ret, h; + + ret = 0; + origsize = *size; + de = (struct arpt_entry *)*dstptr; + memcpy(de, e, sizeof(struct arpt_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); + + *dstptr += sizeof(struct arpt_entry); + *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); + + de->target_offset = e->target_offset - (origsize - *size); + t = compat_arpt_get_target(e); + target = t->u.kernel.target; + xt_compat_target_from_user(t, dstptr, size); + + de->next_offset = e->next_offset - (origsize - *size); + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)de - base < newinfo->hook_entry[h]) + newinfo->hook_entry[h] -= origsize - *size; + if ((unsigned char *)de - base < newinfo->underflow[h]) + newinfo->underflow[h] -= origsize - *size; + } + return ret; +} + +static inline int compat_check_entry(struct arpt_entry *e, const char *name, + unsigned int *i) +{ + int ret; + + ret = check_target(e, name); + if (ret) + return ret; + + (*i)++; + return 0; +} + +static int translate_compat_table(const char *name, + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) +{ + unsigned int i, j; + struct xt_table_info *newinfo, *info; + void *pos, *entry0, *entry1; + unsigned int size; + int ret; + + info = *pinfo; + entry0 = *pentry0; + size = total_size; + info->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + info->hook_entry[i] = 0xFFFFFFFF; + info->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_compat_table: size %u\n", info->size); + j = 0; + xt_compat_lock(NF_ARP); + /* Walk through entries, checking offsets. */ + ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, + check_compat_entry_size_and_hooks, + info, &size, entry0, + entry0 + total_size, + hook_entries, underflows, &j, name); + if (ret != 0) + goto out_unlock; + + ret = -EINVAL; + if (j != number) { + duprintf("translate_compat_table: %u not %u entries\n", + j, number); + goto out_unlock; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (info->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + goto out_unlock; + } + if (info->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + goto out_unlock; + } + } + + ret = -ENOMEM; + newinfo = xt_alloc_table_info(size); + if (!newinfo) + goto out_unlock; + + newinfo->number = number; + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = info->hook_entry[i]; + newinfo->underflow[i] = info->underflow[i]; + } + entry1 = newinfo->entries[raw_smp_processor_id()]; + pos = entry1; + size = total_size; + ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, + compat_copy_entry_from_user, + &pos, &size, name, newinfo, entry1); + xt_compat_flush_offsets(NF_ARP); + xt_compat_unlock(NF_ARP); + if (ret) + goto free_newinfo; + + ret = -ELOOP; + if (!mark_source_chains(newinfo, valid_hooks, entry1)) + goto free_newinfo; + + i = 0; + ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, + name, &i); + if (ret) { + j -= i; + COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, + compat_release_entry, &j); + ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_free_table_info(newinfo); + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) + if (newinfo->entries[i] && newinfo->entries[i] != entry1) + memcpy(newinfo->entries[i], entry1, newinfo->size); + + *pinfo = newinfo; + *pentry0 = entry1; + xt_free_table_info(info); + return 0; + +free_newinfo: + xt_free_table_info(newinfo); +out: + COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + return ret; +out_unlock: + xt_compat_flush_offsets(NF_ARP); + xt_compat_unlock(NF_ARP); + goto out; +} + +struct compat_arpt_replace { + char name[ARPT_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_ARP_NUMHOOKS]; + u32 underflow[NF_ARP_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; + struct compat_arpt_entry entries[0]; +}; + +static int compat_do_replace(void __user *user, unsigned int len) +{ + int ret; + struct compat_arpt_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.size >= INT_MAX / num_possible_cpus()) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_compat_table(tmp.name, tmp.valid_hooks, + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("compat_do_replace: Translated table\n"); + + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) { int ret; @@ -976,11 +1494,11 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned switch (cmd) { case ARPT_SO_SET_REPLACE: - ret = do_replace(user, len); + ret = compat_do_replace(user, len); break; case ARPT_SO_SET_ADD_COUNTERS: - ret = do_add_counters(user, len); + ret = do_add_counters(user, len, 1); break; default: @@ -991,74 +1509,190 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned return ret; } -static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, + compat_uint_t *size, + struct xt_counters *counters, + unsigned int *i) { + struct arpt_entry_target *t; + struct compat_arpt_entry __user *ce; + u_int16_t target_offset, next_offset; + compat_uint_t origsize; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; + ret = -EFAULT; + origsize = *size; + ce = (struct compat_arpt_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(struct arpt_entry))) + goto out; - switch (cmd) { - case ARPT_SO_GET_INFO: { - char name[ARPT_TABLE_MAXNAMELEN]; - struct arpt_table *t; + if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) + goto out; + + *dstptr += sizeof(struct compat_arpt_entry); + *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); - if (*len != sizeof(struct arpt_getinfo)) { - duprintf("length %u != %Zu\n", *len, - sizeof(struct arpt_getinfo)); + target_offset = e->target_offset - (origsize - *size); + + t = arpt_get_target(e); + ret = xt_compat_target_to_user(t, dstptr, size); + if (ret) + goto out; + ret = -EFAULT; + next_offset = e->next_offset - (origsize - *size); + if (put_user(target_offset, &ce->target_offset)) + goto out; + if (put_user(next_offset, &ce->next_offset)) + goto out; + + (*i)++; + return 0; +out: + return ret; +} + +static int compat_copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void __user *userptr) +{ + struct xt_counters *counters; + struct xt_table_info *private = table->private; + void __user *pos; + unsigned int size; + int ret = 0; + void *loc_cpu_entry; + unsigned int i = 0; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + pos = userptr; + size = total_size; + ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size, + compat_copy_entry_to_user, + &pos, &size, counters, &i); + vfree(counters); + return ret; +} + +struct compat_arpt_get_entries { + char name[ARPT_TABLE_MAXNAMELEN]; + compat_uint_t size; + struct compat_arpt_entry entrytable[0]; +}; + +static int compat_get_entries(struct compat_arpt_get_entries __user *uptr, + int *len) +{ + int ret; + struct compat_arpt_get_entries get; + struct arpt_table *t; + + if (*len < sizeof(get)) { + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct compat_arpt_get_entries) + get.size) { + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + xt_compat_lock(NF_ARP); + t = xt_find_table_lock(NF_ARP, get.name); + if (t && !IS_ERR(t)) { + struct xt_table_info *private = t->private; + struct xt_table_info info; + + duprintf("t->private->number = %u\n", private->number); + ret = compat_table_info(private, &info); + if (!ret && get.size == info.size) { + ret = compat_copy_entries_to_user(private->size, + t, uptr->entrytable); + } else if (!ret) { + duprintf("compat_get_entries: I've got %u not %u!\n", + private->size, get.size); ret = -EINVAL; - break; } + xt_compat_flush_offsets(NF_ARP); + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? PTR_ERR(t) : -ENOENT; - if (copy_from_user(name, user, sizeof(name)) != 0) { - ret = -EFAULT; - break; - } - name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; - - t = try_then_request_module(xt_find_table_lock(NF_ARP, name), - "arptable_%s", name); - if (t && !IS_ERR(t)) { - struct arpt_getinfo info; - struct xt_table_info *private = t->private; - - info.valid_hooks = t->valid_hooks; - memcpy(info.hook_entry, private->hook_entry, - sizeof(info.hook_entry)); - memcpy(info.underflow, private->underflow, - sizeof(info.underflow)); - info.num_entries = private->number; - info.size = private->size; - strcpy(info.name, name); - - if (copy_to_user(user, &info, *len) != 0) - ret = -EFAULT; - else - ret = 0; - xt_table_unlock(t); - module_put(t->me); - } else - ret = t ? PTR_ERR(t) : -ENOENT; + xt_compat_unlock(NF_ARP); + return ret; +} + +static int do_arpt_get_ctl(struct sock *, int, void __user *, int *); + +static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, + int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: + ret = get_info(user, len, 1); + break; + case ARPT_SO_GET_ENTRIES: + ret = compat_get_entries(user, len); + break; + default: + ret = do_arpt_get_ctl(sk, cmd, user, len); } - break; + return ret; +} +#endif - case ARPT_SO_GET_ENTRIES: { - struct arpt_get_entries get; +static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); - ret = -EINVAL; - } else if (copy_from_user(&get, user, sizeof(get)) != 0) { - ret = -EFAULT; - } else if (*len != sizeof(struct arpt_get_entries) + get.size) { - duprintf("get_entries: %u != %Zu\n", *len, - sizeof(struct arpt_get_entries) + get.size); - ret = -EINVAL; - } else - ret = get_entries(&get, user); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len, 0); break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; } + return ret; +} + +static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: + ret = get_info(user, len, 0); + break; + + case ARPT_SO_GET_ENTRIES: + ret = get_entries(user, len); + break; + case ARPT_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; @@ -1090,7 +1724,7 @@ int arpt_register_table(struct arpt_table *table, { int ret; struct xt_table_info *newinfo; - static struct xt_table_info bootstrap + struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; @@ -1144,6 +1778,11 @@ static struct arpt_target arpt_standard_target __read_mostly = { .name = ARPT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NF_ARP, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, +#endif }; static struct arpt_target arpt_error_target __read_mostly = { @@ -1158,9 +1797,15 @@ static struct nf_sockopt_ops arpt_sockopts = { .set_optmin = ARPT_BASE_CTL, .set_optmax = ARPT_SO_SET_MAX+1, .set = do_arpt_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_arpt_set_ctl, +#endif .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_arpt_get_ctl, +#endif .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 302d3da5f696..7201511d54d2 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -64,7 +64,7 @@ static unsigned int arpt_hook(unsigned int hook, return arpt_do_table(skb, hook, in, out, &packet_filter); } -static struct nf_hook_ops arpt_ops[] = { +static struct nf_hook_ops arpt_ops[] __read_mostly = { { .hook = arpt_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 14d64a383db1..5109839da222 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -28,19 +28,15 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <net/route.h> +#include <net/netfilter/nf_queue.h> +#include <net/ip.h> #define IPQ_QMAX_DEFAULT 1024 #define IPQ_PROC_FS_NAME "ip_queue" #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" -struct ipq_queue_entry { - struct list_head list; - struct nf_info *info; - struct sk_buff *skb; -}; - -typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); +typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; @@ -54,76 +50,13 @@ static struct sock *ipqnl __read_mostly; static LIST_HEAD(queue_list); static DEFINE_MUTEX(ipqnl_mutex); -static void -ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) -{ - /* TCP input path (and probably other bits) assume to be called - * from softirq context, not from syscall, like ipq_issue_verdict is - * called. TCP input path deadlocks with locks taken from timer - * softirq, e.g. We therefore emulate this by local_bh_disable() */ - - local_bh_disable(); - nf_reinject(entry->skb, entry->info, verdict); - local_bh_enable(); - - kfree(entry); -} - static inline void -__ipq_enqueue_entry(struct ipq_queue_entry *entry) +__ipq_enqueue_entry(struct nf_queue_entry *entry) { - list_add(&entry->list, &queue_list); + list_add_tail(&entry->list, &queue_list); queue_total++; } -/* - * Find and return a queued entry matched by cmpfn, or return the last - * entry if cmpfn is NULL. - */ -static inline struct ipq_queue_entry * -__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) -{ - struct list_head *p; - - list_for_each_prev(p, &queue_list) { - struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; - - if (!cmpfn || cmpfn(entry, data)) - return entry; - } - return NULL; -} - -static inline void -__ipq_dequeue_entry(struct ipq_queue_entry *entry) -{ - list_del(&entry->list); - queue_total--; -} - -static inline struct ipq_queue_entry * -__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) -{ - struct ipq_queue_entry *entry; - - entry = __ipq_find_entry(cmpfn, data); - if (entry == NULL) - return NULL; - - __ipq_dequeue_entry(entry); - return entry; -} - - -static inline void -__ipq_flush(int verdict) -{ - struct ipq_queue_entry *entry; - - while ((entry = __ipq_find_dequeue_entry(NULL, 0))) - ipq_issue_verdict(entry, verdict); -} - static inline int __ipq_set_mode(unsigned char mode, unsigned int range) { @@ -150,36 +83,64 @@ __ipq_set_mode(unsigned char mode, unsigned int range) return status; } +static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); + static inline void __ipq_reset(void) { peer_pid = 0; net_disable_timestamp(); __ipq_set_mode(IPQ_COPY_NONE, 0); - __ipq_flush(NF_DROP); + __ipq_flush(NULL, 0); } -static struct ipq_queue_entry * -ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +static struct nf_queue_entry * +ipq_find_dequeue_entry(unsigned long id) { - struct ipq_queue_entry *entry; + struct nf_queue_entry *entry = NULL, *i; write_lock_bh(&queue_lock); - entry = __ipq_find_dequeue_entry(cmpfn, data); + + list_for_each_entry(i, &queue_list, list) { + if ((unsigned long)i == id) { + entry = i; + break; + } + } + + if (entry) { + list_del(&entry->list); + queue_total--; + } + write_unlock_bh(&queue_lock); return entry; } static void -ipq_flush(int verdict) +__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue_total--; + nf_reinject(entry, NF_DROP); + } + } +} + +static void +ipq_flush(ipq_cmpfn cmpfn, unsigned long data) { write_lock_bh(&queue_lock); - __ipq_flush(verdict); + __ipq_flush(cmpfn, data); write_unlock_bh(&queue_lock); } static struct sk_buff * -ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) { sk_buff_data_t old_tail; size_t size = 0; @@ -236,20 +197,20 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->timestamp_sec = tv.tv_sec; pmsg->timestamp_usec = tv.tv_usec; pmsg->mark = entry->skb->mark; - pmsg->hook = entry->info->hook; + pmsg->hook = entry->hook; pmsg->hw_protocol = entry->skb->protocol; - if (entry->info->indev) - strcpy(pmsg->indev_name, entry->info->indev->name); + if (entry->indev) + strcpy(pmsg->indev_name, entry->indev->name); else pmsg->indev_name[0] = '\0'; - if (entry->info->outdev) - strcpy(pmsg->outdev_name, entry->info->outdev->name); + if (entry->outdev) + strcpy(pmsg->outdev_name, entry->outdev->name); else pmsg->outdev_name[0] = '\0'; - if (entry->info->indev && entry->skb->dev) { + if (entry->indev && entry->skb->dev) { pmsg->hw_type = entry->skb->dev->type; pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); @@ -271,28 +232,17 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, - unsigned int queuenum, void *data) +ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { int status = -EINVAL; struct sk_buff *nskb; - struct ipq_queue_entry *entry; if (copy_mode == IPQ_COPY_NONE) return -EAGAIN; - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) { - printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n"); - return -ENOMEM; - } - - entry->info = info; - entry->skb = skb; - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) - goto err_out_free; + return status; write_lock_bh(&queue_lock); @@ -326,14 +276,11 @@ err_out_free_nskb: err_out_unlock: write_unlock_bh(&queue_lock); - -err_out_free: - kfree(entry); return status; } static int -ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct nf_queue_entry *e) { int diff; int err; @@ -368,21 +315,15 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) return 0; } -static inline int -id_cmp(struct ipq_queue_entry *e, unsigned long id) -{ - return (id == (unsigned long )e); -} - static int ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) { - struct ipq_queue_entry *entry; + struct nf_queue_entry *entry; if (vmsg->value > NF_MAX_VERDICT) return -EINVAL; - entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + entry = ipq_find_dequeue_entry(vmsg->id); if (entry == NULL) return -ENOENT; else { @@ -392,7 +333,7 @@ ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) if (ipq_mangle_ipv4(vmsg, entry) < 0) verdict = NF_DROP; - ipq_issue_verdict(entry, verdict); + nf_reinject(entry, verdict); return 0; } } @@ -437,13 +378,13 @@ ipq_receive_peer(struct ipq_peer_msg *pmsg, } static int -dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { - if (entry->info->indev) - if (entry->info->indev->ifindex == ifindex) + if (entry->indev) + if (entry->indev->ifindex == ifindex) return 1; - if (entry->info->outdev) - if (entry->info->outdev->ifindex == ifindex) + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) return 1; #ifdef CONFIG_BRIDGE_NETFILTER if (entry->skb->nf_bridge) { @@ -461,10 +402,7 @@ dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) static void ipq_dev_drop(int ifindex) { - struct ipq_queue_entry *entry; - - while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) - ipq_issue_verdict(entry, NF_DROP); + ipq_flush(dev_cmp, ifindex); } #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) @@ -588,26 +526,6 @@ static ctl_table ipq_table[] = { { .ctl_name = 0 } }; -static ctl_table ipq_dir_table[] = { - { - .ctl_name = NET_IPV4, - .procname = "ipv4", - .mode = 0555, - .child = ipq_table - }, - { .ctl_name = 0 } -}; - -static ctl_table ipq_root_table[] = { - { - .ctl_name = CTL_NET, - .procname = "net", - .mode = 0555, - .child = ipq_dir_table - }, - { .ctl_name = 0 } -}; - static int ip_queue_show(struct seq_file *m, void *v) { read_lock_bh(&queue_lock); @@ -645,7 +563,7 @@ static const struct file_operations ip_queue_proc_fops = { .owner = THIS_MODULE, }; -static struct nf_queue_handler nfqh = { +static const struct nf_queue_handler nfqh = { .name = "ip_queue", .outfn = &ipq_enqueue_packet, }; @@ -673,7 +591,7 @@ static int __init ip_queue_init(void) } register_netdevice_notifier(&ipq_dev_notifier); - ipq_sysctl_header = register_sysctl_table(ipq_root_table); + ipq_sysctl_header = register_sysctl_paths(net_ipv4_ctl_path, ipq_table); status = nf_register_queue_handler(PF_INET, &nfqh); if (status < 0) { @@ -687,7 +605,7 @@ cleanup_sysctl: unregister_netdevice_notifier(&ipq_dev_notifier); proc_net_remove(&init_net, IPQ_PROC_FS_NAME); cleanup_ipqnl: - sock_release(ipqnl->sk_socket); + netlink_kernel_release(ipqnl); mutex_lock(&ipqnl_mutex); mutex_unlock(&ipqnl_mutex); @@ -700,13 +618,13 @@ static void __exit ip_queue_fini(void) { nf_unregister_queue_handlers(&nfqh); synchronize_net(); - ipq_flush(NF_DROP); + ipq_flush(NULL, 0); unregister_sysctl_table(ipq_sysctl_header); unregister_netdevice_notifier(&ipq_dev_notifier); proc_net_remove(&init_net, IPQ_PROC_FS_NAME); - sock_release(ipqnl->sk_socket); + netlink_kernel_release(ipqnl); mutex_lock(&ipqnl_mutex); mutex_unlock(&ipqnl_mutex); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b9b189c26208..982b7f986291 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -26,6 +26,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <net/netfilter/nf_log.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -74,7 +75,8 @@ do { \ Hence the start of any table is given by get_table() below. */ /* Returns whether matches rule or not. */ -static inline int +/* Performance critical - called for every packet */ +static inline bool ip_packet_match(const struct iphdr *ip, const char *indev, const char *outdev, @@ -84,7 +86,7 @@ ip_packet_match(const struct iphdr *ip, size_t i; unsigned long ret; -#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg)) +#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, IPT_INV_SRCIP) @@ -102,7 +104,7 @@ ip_packet_match(const struct iphdr *ip, NIPQUAD(ipinfo->dmsk.s_addr), NIPQUAD(ipinfo->dst.s_addr), ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); - return 0; + return false; } /* Look for ifname matches; this should unroll nicely. */ @@ -116,7 +118,7 @@ ip_packet_match(const struct iphdr *ip, dprintf("VIA in mismatch (%s vs %s).%s\n", indev, ipinfo->iniface, ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); - return 0; + return false; } for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { @@ -129,7 +131,7 @@ ip_packet_match(const struct iphdr *ip, dprintf("VIA out mismatch (%s vs %s).%s\n", outdev, ipinfo->outiface, ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); - return 0; + return false; } /* Check specific protocol */ @@ -138,7 +140,7 @@ ip_packet_match(const struct iphdr *ip, dprintf("Packet protocol %hi does not match %hi.%s\n", ip->protocol, ipinfo->proto, ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); - return 0; + return false; } /* If we have a fragment rule but the packet is not a fragment @@ -146,13 +148,13 @@ ip_packet_match(const struct iphdr *ip, if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { dprintf("Fragment rule but not fragment.%s\n", ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : ""); - return 0; + return false; } - return 1; + return true; } -static inline bool +static bool ip_checkentry(const struct ipt_ip *ip) { if (ip->flags & ~IPT_F_MASK) { @@ -182,8 +184,9 @@ ipt_error(struct sk_buff *skb, return NF_DROP; } -static inline -bool do_match(struct ipt_entry_match *m, +/* Performance critical - called for every packet */ +static inline bool +do_match(struct ipt_entry_match *m, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -198,6 +201,7 @@ bool do_match(struct ipt_entry_match *m, return false; } +/* Performance critical */ static inline struct ipt_entry * get_entry(void *base, unsigned int offset) { @@ -205,6 +209,7 @@ get_entry(void *base, unsigned int offset) } /* All zeroes == unconditional rule. */ +/* Mildly perf critical (only if packet tracing is on) */ static inline int unconditional(const struct ipt_ip *ip) { @@ -215,16 +220,17 @@ unconditional(const struct ipt_ip *ip) return 0; return 1; +#undef FWINV } #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) -static const char *hooknames[] = { - [NF_IP_PRE_ROUTING] = "PREROUTING", - [NF_IP_LOCAL_IN] = "INPUT", - [NF_IP_FORWARD] = "FORWARD", - [NF_IP_LOCAL_OUT] = "OUTPUT", - [NF_IP_POST_ROUTING] = "POSTROUTING", +static const char *const hooknames[] = { + [NF_INET_PRE_ROUTING] = "PREROUTING", + [NF_INET_LOCAL_IN] = "INPUT", + [NF_INET_FORWARD] = "FORWARD", + [NF_INET_LOCAL_OUT] = "OUTPUT", + [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { @@ -233,7 +239,7 @@ enum nf_ip_trace_comments { NF_IP_TRACE_COMMENT_POLICY, }; -static const char *comments[] = { +static const char *const comments[] = { [NF_IP_TRACE_COMMENT_RULE] = "rule", [NF_IP_TRACE_COMMENT_RETURN] = "return", [NF_IP_TRACE_COMMENT_POLICY] = "policy", @@ -249,6 +255,7 @@ static struct nf_loginfo trace_loginfo = { }, }; +/* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, char *hookname, char **chainname, @@ -465,10 +472,9 @@ mark_source_chains(struct xt_table_info *newinfo, /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ - for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { + for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct ipt_entry *e - = (struct ipt_entry *)(entry0 + pos); + struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -481,13 +487,12 @@ mark_source_chains(struct xt_table_info *newinfo, = (void *)ipt_get_target(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { printk("iptables: loop hook %u pos %u %08X.\n", hook, pos, e->comefrom); return 0; } - e->comefrom - |= ((1 << hook) | (1 << NF_IP_NUMHOOKS)); + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct ipt_entry) @@ -507,10 +512,10 @@ mark_source_chains(struct xt_table_info *newinfo, /* Return: backtrack through the last big jump. */ do { - e->comefrom ^= (1<<NF_IP_NUMHOOKS); + e->comefrom ^= (1<<NF_INET_NUMHOOKS); #ifdef DEBUG_IP_FIREWALL_USER if (e->comefrom - & (1 << NF_IP_NUMHOOKS)) { + & (1 << NF_INET_NUMHOOKS)) { duprintf("Back unset " "on hook %u " "rule %u\n", @@ -567,7 +572,7 @@ mark_source_chains(struct xt_table_info *newinfo, return 1; } -static inline int +static int cleanup_match(struct ipt_entry_match *m, unsigned int *i) { if (i && (*i)-- == 0) @@ -579,7 +584,7 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i) return 0; } -static inline int +static int check_entry(struct ipt_entry *e, const char *name) { struct ipt_entry_target *t; @@ -589,7 +594,8 @@ check_entry(struct ipt_entry *e, const char *name) return -EINVAL; } - if (e->target_offset + sizeof(struct ipt_entry_target) > e->next_offset) + if (e->target_offset + sizeof(struct ipt_entry_target) > + e->next_offset) return -EINVAL; t = ipt_get_target(e); @@ -599,9 +605,10 @@ check_entry(struct ipt_entry *e, const char *name) return 0; } -static inline int check_match(struct ipt_entry_match *m, const char *name, - const struct ipt_ip *ip, unsigned int hookmask, - unsigned int *i) +static int +check_match(struct ipt_entry_match *m, const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, unsigned int *i) { struct xt_match *match; int ret; @@ -622,18 +629,18 @@ static inline int check_match(struct ipt_entry_match *m, const char *name, return ret; } -static inline int +static int find_check_match(struct ipt_entry_match *m, - const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, - unsigned int *i) + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + unsigned int *i) { struct xt_match *match; int ret; match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, - m->u.user.revision), + m->u.user.revision), "ipt_%s", m->u.user.name); if (IS_ERR(match) || !match) { duprintf("find_check_match: `%s' not found\n", m->u.user.name); @@ -651,7 +658,7 @@ err: return ret; } -static inline int check_target(struct ipt_entry *e, const char *name) +static int check_target(struct ipt_entry *e, const char *name) { struct ipt_entry_target *t; struct xt_target *target; @@ -663,8 +670,8 @@ static inline int check_target(struct ipt_entry *e, const char *name) name, e->comefrom, e->ip.proto, e->ip.invflags & IPT_INV_PROTO); if (!ret && t->u.kernel.target->checkentry - && !t->u.kernel.target->checkentry(name, e, target, - t->data, e->comefrom)) { + && !t->u.kernel.target->checkentry(name, e, target, t->data, + e->comefrom)) { duprintf("ip_tables: check failed for `%s'.\n", t->u.kernel.target->name); ret = -EINVAL; @@ -672,9 +679,9 @@ static inline int check_target(struct ipt_entry *e, const char *name) return ret; } -static inline int +static int find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, - unsigned int *i) + unsigned int *i) { struct ipt_entry_target *t; struct xt_target *target; @@ -687,14 +694,14 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, j = 0; ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip, - e->comefrom, &j); + e->comefrom, &j); if (ret != 0) goto cleanup_matches; t = ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, - t->u.user.name, - t->u.user.revision), + t->u.user.name, + t->u.user.revision), "ipt_%s", t->u.user.name); if (IS_ERR(target) || !target) { duprintf("find_check_entry: `%s' not found\n", t->u.user.name); @@ -716,7 +723,7 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, return ret; } -static inline int +static int check_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, unsigned char *base, @@ -741,7 +748,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, } /* Check hooks & underflows */ - for (h = 0; h < NF_IP_NUMHOOKS; h++) { + for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) @@ -759,7 +766,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, return 0; } -static inline int +static int cleanup_entry(struct ipt_entry *e, unsigned int *i) { struct ipt_entry_target *t; @@ -795,7 +802,7 @@ translate_table(const char *name, newinfo->number = number; /* Init all hooks to impossible value. */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } @@ -819,7 +826,7 @@ translate_table(const char *name, } /* Check hooks all assigned */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(valid_hooks & (1 << i))) continue; @@ -915,7 +922,7 @@ get_counters(const struct xt_table_info *t, } } -static inline struct xt_counters * alloc_counters(struct xt_table *table) +static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; @@ -959,7 +966,6 @@ copy_entries_to_user(unsigned int total_size, * allowed to migrate to another cpu) */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - /* ... then copy entire thing ... */ if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; @@ -1014,63 +1020,12 @@ copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -struct compat_delta { - struct compat_delta *next; - unsigned int offset; - short delta; -}; - -static struct compat_delta *compat_offsets = NULL; - -static int compat_add_offset(unsigned int offset, short delta) -{ - struct compat_delta *tmp; - - tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); - if (!tmp) - return -ENOMEM; - tmp->offset = offset; - tmp->delta = delta; - if (compat_offsets) { - tmp->next = compat_offsets->next; - compat_offsets->next = tmp; - } else { - compat_offsets = tmp; - tmp->next = NULL; - } - return 0; -} - -static void compat_flush_offsets(void) -{ - struct compat_delta *tmp, *next; - - if (compat_offsets) { - for(tmp = compat_offsets; tmp; tmp = next) { - next = tmp->next; - kfree(tmp); - } - compat_offsets = NULL; - } -} - -static short compat_calc_jump(unsigned int offset) -{ - struct compat_delta *tmp; - short delta; - - for(tmp = compat_offsets, delta = 0; tmp; tmp = tmp->next) - if (tmp->offset < offset) - delta += tmp->delta; - return delta; -} - static void compat_standard_from_user(void *dst, void *src) { int v = *(compat_int_t *)src; if (v > 0) - v += compat_calc_jump(v); + v += xt_compat_calc_jump(AF_INET, v); memcpy(dst, &v, sizeof(v)); } @@ -1079,64 +1034,61 @@ static int compat_standard_to_user(void __user *dst, void *src) compat_int_t cv = *(int *)src; if (cv > 0) - cv -= compat_calc_jump(cv); + cv -= xt_compat_calc_jump(AF_INET, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static inline int -compat_calc_match(struct ipt_entry_match *m, int * size) +compat_calc_match(struct ipt_entry_match *m, int *size) { *size += xt_compat_match_offset(m->u.kernel.match); return 0; } -static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) +static int compat_calc_entry(struct ipt_entry *e, + const struct xt_table_info *info, + void *base, struct xt_table_info *newinfo) { struct ipt_entry_target *t; unsigned int entry_offset; int off, i, ret; - off = 0; + off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - base; IPT_MATCH_ITERATE(e, compat_calc_match, &off); t = ipt_get_target(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; - ret = compat_add_offset(entry_offset, off); + ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) return ret; - for (i = 0; i< NF_IP_NUMHOOKS; i++) { - if (info->hook_entry[i] && (e < (struct ipt_entry *) - (base + info->hook_entry[i]))) + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct ipt_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; - if (info->underflow[i] && (e < (struct ipt_entry *) - (base + info->underflow[i]))) + if (info->underflow[i] && + (e < (struct ipt_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } -static int compat_table_info(struct xt_table_info *info, - struct xt_table_info *newinfo) +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) { void *loc_cpu_entry; - int i; if (!newinfo || !info) return -EINVAL; - memset(newinfo, 0, sizeof(struct xt_table_info)); - newinfo->size = info->size; - newinfo->number = info->number; - for (i = 0; i < NF_IP_NUMHOOKS; i++) { - newinfo->hook_entry[i] = info->hook_entry[i]; - newinfo->underflow[i] = info->underflow[i]; - } + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, newinfo); + compat_calc_entry, info, loc_cpu_entry, + newinfo); } #endif @@ -1147,8 +1099,8 @@ static int get_info(void __user *user, int *len, int compat) int ret; if (*len != sizeof(struct ipt_getinfo)) { - duprintf("length %u != %u\n", *len, - (unsigned int)sizeof(struct ipt_getinfo)); + duprintf("length %u != %zu\n", *len, + sizeof(struct ipt_getinfo)); return -EINVAL; } @@ -1161,7 +1113,7 @@ static int get_info(void __user *user, int *len, int compat) xt_compat_lock(AF_INET); #endif t = try_then_request_module(xt_find_table_lock(AF_INET, name), - "iptable_%s", name); + "iptable_%s", name); if (t && !IS_ERR(t)) { struct ipt_getinfo info; struct xt_table_info *private = t->private; @@ -1170,15 +1122,15 @@ static int get_info(void __user *user, int *len, int compat) if (compat) { struct xt_table_info tmp; ret = compat_table_info(private, &tmp); - compat_flush_offsets(); - private = &tmp; + xt_compat_flush_offsets(AF_INET); + private = &tmp; } #endif info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, - sizeof(info.hook_entry)); + sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, - sizeof(info.underflow)); + sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); @@ -1207,31 +1159,27 @@ get_entries(struct ipt_get_entries __user *uptr, int *len) struct xt_table *t; if (*len < sizeof(get)) { - duprintf("get_entries: %u < %d\n", *len, - (unsigned int)sizeof(get)); + duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); return -EINVAL; } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ipt_get_entries) + get.size) { - duprintf("get_entries: %u != %u\n", *len, - (unsigned int)(sizeof(struct ipt_get_entries) + - get.size)); + duprintf("get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); return -EINVAL; } t = xt_find_table_lock(AF_INET, get.name); if (t && !IS_ERR(t)) { struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", - private->number); + duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else { duprintf("get_entries: I've got %u not %u!\n", - private->size, - get.size); + private->size, get.size); ret = -EINVAL; } module_put(t->me); @@ -1244,8 +1192,8 @@ get_entries(struct ipt_get_entries __user *uptr, int *len) static int __do_replace(const char *name, unsigned int valid_hooks, - struct xt_table_info *newinfo, unsigned int num_counters, - void __user *counters_ptr) + struct xt_table_info *newinfo, unsigned int num_counters, + void __user *counters_ptr) { int ret; struct xt_table *t; @@ -1293,7 +1241,8 @@ __do_replace(const char *name, unsigned int valid_hooks, get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, + NULL); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1322,14 +1271,7 @@ do_replace(void __user *user, unsigned int len) if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - /* Hack: Causes ipchains to give correct error msg --RR */ - if (len != sizeof(tmp) + tmp.size) - return -ENOPROTOOPT; - /* overflow check */ - if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - - SMP_CACHE_BYTES) - return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; @@ -1337,7 +1279,7 @@ do_replace(void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is our node/cpu */ + /* choose the copy that is on our node/cpu */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { @@ -1353,15 +1295,14 @@ do_replace(void __user *user, unsigned int len) duprintf("ip_tables: Translated table\n"); - ret = __do_replace(tmp.name, tmp.valid_hooks, - newinfo, tmp.num_counters, - tmp.counters); + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1369,7 +1310,7 @@ do_replace(void __user *user, unsigned int len) /* We're lazy, and add to the first CPU; overflow works its fey magic * and everything is OK. */ -static inline int +static int add_counter_to_entry(struct ipt_entry *e, const struct xt_counters addme[], unsigned int *i) @@ -1479,19 +1420,13 @@ struct compat_ipt_replace { u32 valid_hooks; u32 num_entries; u32 size; - u32 hook_entry[NF_IP_NUMHOOKS]; - u32 underflow[NF_IP_NUMHOOKS]; + u32 hook_entry[NF_INET_NUMHOOKS]; + u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct ipt_counters * */ struct compat_ipt_entry entries[0]; }; -static inline int compat_copy_match_to_user(struct ipt_entry_match *m, - void __user **dstptr, compat_uint_t *size) -{ - return xt_compat_match_to_user(m, dstptr, size); -} - static int compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, @@ -1513,7 +1448,9 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, goto out; *dstptr += sizeof(struct compat_ipt_entry); - ret = IPT_MATCH_ITERATE(e, compat_copy_match_to_user, dstptr, size); + *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + + ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); target_offset = e->target_offset - (origsize - *size); if (ret) goto out; @@ -1534,21 +1471,21 @@ out: return ret; } -static inline int +static int compat_find_calc_match(struct ipt_entry_match *m, - const char *name, - const struct ipt_ip *ip, - unsigned int hookmask, - int *size, int *i) + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + int *size, int *i) { struct xt_match *match; match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, - m->u.user.revision), + m->u.user.revision), "ipt_%s", m->u.user.name); if (IS_ERR(match) || !match) { duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + m->u.user.name); return match ? PTR_ERR(match) : -ENOENT; } m->u.kernel.match = match; @@ -1558,7 +1495,7 @@ compat_find_calc_match(struct ipt_entry_match *m, return 0; } -static inline int +static int compat_release_match(struct ipt_entry_match *m, unsigned int *i) { if (i && (*i)-- == 0) @@ -1568,8 +1505,8 @@ compat_release_match(struct ipt_entry_match *m, unsigned int *i) return 0; } -static inline int -compat_release_entry(struct ipt_entry *e, unsigned int *i) +static int +compat_release_entry(struct compat_ipt_entry *e, unsigned int *i) { struct ipt_entry_target *t; @@ -1577,22 +1514,22 @@ compat_release_entry(struct ipt_entry *e, unsigned int *i) return 1; /* Cleanup all matches */ - IPT_MATCH_ITERATE(e, compat_release_match, NULL); - t = ipt_get_target(e); + COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL); + t = compat_ipt_get_target(e); module_put(t->u.kernel.target->me); return 0; } -static inline int -check_compat_entry_size_and_hooks(struct ipt_entry *e, - struct xt_table_info *newinfo, - unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, - const char *name) +static int +check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + unsigned char *base, + unsigned char *limit, + unsigned int *hook_entries, + unsigned int *underflows, + unsigned int *i, + const char *name) { struct ipt_entry_target *t; struct xt_target *target; @@ -1607,32 +1544,33 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, } if (e->next_offset < sizeof(struct compat_ipt_entry) + - sizeof(struct compat_xt_entry_target)) { + sizeof(struct compat_xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; } - ret = check_entry(e, name); + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct ipt_entry *)e, name); if (ret) return ret; - off = 0; + off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - (void *)base; j = 0; - ret = IPT_MATCH_ITERATE(e, compat_find_calc_match, name, &e->ip, - e->comefrom, &off, &j); + ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name, + &e->ip, e->comefrom, &off, &j); if (ret != 0) goto release_matches; - t = ipt_get_target(e); + t = compat_ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, - t->u.user.name, - t->u.user.revision), + t->u.user.name, + t->u.user.revision), "ipt_%s", t->u.user.name); if (IS_ERR(target) || !target) { duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); + t->u.user.name); ret = target ? PTR_ERR(target) : -ENOENT; goto release_matches; } @@ -1640,12 +1578,12 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, off += xt_compat_target_offset(target); *size += off; - ret = compat_add_offset(entry_offset, off); + ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) goto out; /* Check hooks & underflows */ - for (h = 0; h < NF_IP_NUMHOOKS; h++) { + for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) @@ -1653,7 +1591,7 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e, } /* Clear counters and comefrom */ - e->counters = ((struct ipt_counters) { 0, 0 }); + memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; (*i)++; @@ -1666,17 +1604,10 @@ release_matches: return ret; } -static inline int compat_copy_match_from_user(struct ipt_entry_match *m, - void **dstptr, compat_uint_t *size, const char *name, - const struct ipt_ip *ip, unsigned int hookmask) -{ - xt_compat_match_from_user(m, dstptr, size); - return 0; -} - -static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, - unsigned int *size, const char *name, - struct xt_table_info *newinfo, unsigned char *base) +static int +compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) { struct ipt_entry_target *t; struct xt_target *target; @@ -1688,19 +1619,22 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, origsize = *size; de = (struct ipt_entry *)*dstptr; memcpy(de, e, sizeof(struct ipt_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); - *dstptr += sizeof(struct compat_ipt_entry); - ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size, - name, &de->ip, de->comefrom); + *dstptr += sizeof(struct ipt_entry); + *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); + + ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user, + dstptr, size); if (ret) return ret; de->target_offset = e->target_offset - (origsize - *size); - t = ipt_get_target(e); + t = compat_ipt_get_target(e); target = t->u.kernel.target; xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); - for (h = 0; h < NF_IP_NUMHOOKS; h++) { + for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) @@ -1709,13 +1643,15 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, return ret; } -static inline int compat_check_entry(struct ipt_entry *e, const char *name, - unsigned int *i) +static int +compat_check_entry(struct ipt_entry *e, const char *name, + unsigned int *i) { int j, ret; j = 0; - ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, + e->comefrom, &j); if (ret) goto cleanup_matches; @@ -1733,13 +1669,13 @@ static inline int compat_check_entry(struct ipt_entry *e, const char *name, static int translate_compat_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info **pinfo, - void **pentry0, - unsigned int total_size, - unsigned int number, - unsigned int *hook_entries, - unsigned int *underflows) + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) { unsigned int i, j; struct xt_table_info *newinfo, *info; @@ -1753,7 +1689,7 @@ translate_compat_table(const char *name, info->number = number; /* Init all hooks to impossible value. */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { info->hook_entry[i] = 0xFFFFFFFF; info->underflow[i] = 0xFFFFFFFF; } @@ -1762,11 +1698,11 @@ translate_compat_table(const char *name, j = 0; xt_compat_lock(AF_INET); /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); + ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, + check_compat_entry_size_and_hooks, + info, &size, entry0, + entry0 + total_size, + hook_entries, underflows, &j, name); if (ret != 0) goto out_unlock; @@ -1778,7 +1714,7 @@ translate_compat_table(const char *name, } /* Check hooks all assigned */ - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(valid_hooks & (1 << i))) continue; @@ -1800,17 +1736,17 @@ translate_compat_table(const char *name, goto out_unlock; newinfo->number = number; - for (i = 0; i < NF_IP_NUMHOOKS; i++) { + for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = info->hook_entry[i]; newinfo->underflow[i] = info->underflow[i]; } entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; - size = total_size; - ret = IPT_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, &pos, &size, - name, newinfo, entry1); - compat_flush_offsets(); + size = total_size; + ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, + compat_copy_entry_from_user, + &pos, &size, name, newinfo, entry1); + xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); if (ret) goto free_newinfo; @@ -1821,11 +1757,11 @@ translate_compat_table(const char *name, i = 0; ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + name, &i); if (ret) { j -= i; - IPT_ENTRY_ITERATE_CONTINUE(entry1, newinfo->size, i, - compat_release_entry, &j); + COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, + compat_release_entry, &j); IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); xt_free_table_info(newinfo); return ret; @@ -1844,10 +1780,10 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); return ret; out_unlock: - compat_flush_offsets(); + xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); goto out; } @@ -1863,13 +1799,8 @@ compat_do_replace(void __user *user, unsigned int len) if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; - /* Hack: Causes ipchains to give correct error msg --RR */ - if (len != sizeof(tmp) + tmp.size) - return -ENOPROTOOPT; - /* overflow check */ - if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - - SMP_CACHE_BYTES) + if (tmp.size >= INT_MAX / num_possible_cpus()) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; @@ -1878,7 +1809,7 @@ compat_do_replace(void __user *user, unsigned int len) if (!newinfo) return -ENOMEM; - /* choose the copy that is our node/cpu */ + /* choose the copy that is on our node/cpu */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { @@ -1887,22 +1818,22 @@ compat_do_replace(void __user *user, unsigned int len) } ret = translate_compat_table(tmp.name, tmp.valid_hooks, - &newinfo, &loc_cpu_entry, tmp.size, - tmp.num_entries, tmp.hook_entry, tmp.underflow); + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); if (ret != 0) goto free_newinfo; duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(tmp.name, tmp.valid_hooks, - newinfo, tmp.num_counters, - compat_ptr(tmp.counters)); + ret = __do_replace(tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1910,7 +1841,7 @@ compat_do_replace(void __user *user, unsigned int len) static int compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, - unsigned int len) + unsigned int len) { int ret; @@ -1934,15 +1865,15 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, return ret; } -struct compat_ipt_get_entries -{ +struct compat_ipt_get_entries { char name[IPT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ipt_entry entrytable[0]; }; -static int compat_copy_entries_to_user(unsigned int total_size, - struct xt_table *table, void __user *userptr) +static int +compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, + void __user *userptr) { struct xt_counters *counters; struct xt_table_info *private = table->private; @@ -1978,10 +1909,8 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) struct compat_ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %u\n", - *len, (unsigned int)sizeof(get)); + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); return -EINVAL; } @@ -1989,9 +1918,8 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) return -EFAULT; if (*len != sizeof(struct compat_ipt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %u\n", *len, - (unsigned int)(sizeof(struct compat_ipt_get_entries) + - get.size)); + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); return -EINVAL; } @@ -2000,19 +1928,17 @@ compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) if (t && !IS_ERR(t)) { struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", - private->number); + duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, - t, uptr->entrytable); + t, uptr->entrytable); } else if (!ret) { duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, - get.size); + private->size, get.size); ret = -EINVAL; } - compat_flush_offsets(); + xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); } else @@ -2047,7 +1973,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) #endif static int -do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; @@ -2126,7 +2052,7 @@ int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) { int ret; struct xt_table_info *newinfo; - static struct xt_table_info bootstrap + struct xt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; void *loc_cpu_entry; @@ -2134,9 +2060,7 @@ int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) if (!newinfo) return -ENOMEM; - /* choose the copy on our node/cpu - * but dont care of preemption - */ + /* choose the copy on our node/cpu, but dont care about preemption */ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2178,7 +2102,8 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, u_int8_t type, u_int8_t code, bool invert) { - return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code)) + return ((test_type == 0xFF) || + (type == test_type && code >= min_code && code <= max_code)) ^ invert; } @@ -2219,7 +2144,7 @@ icmp_match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool icmp_checkentry(const char *tablename, - const void *info, + const void *entry, const struct xt_match *match, void *matchinfo, unsigned int hook_mask) @@ -2270,9 +2195,9 @@ static struct xt_match icmp_matchstruct __read_mostly = { .name = "icmp", .match = icmp_match, .matchsize = sizeof(struct ipt_icmp), + .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = AF_INET, - .checkentry = icmp_checkentry, }; static int __init ip_tables_init(void) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2f544dac72df..1b31f7d14d46 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -32,7 +32,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables target for CLUSTERIP"); +MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); struct clusterip_config { struct list_head list; /* list of all configs */ @@ -109,11 +109,9 @@ clusterip_config_entry_put(struct clusterip_config *c) static struct clusterip_config * __clusterip_config_find(__be32 clusterip) { - struct list_head *pos; + struct clusterip_config *c; - list_for_each(pos, &clusterip_configs) { - struct clusterip_config *c = list_entry(pos, - struct clusterip_config, list); + list_for_each_entry(c, &clusterip_configs, list) { if (c->clusterip == clusterip) return c; } @@ -275,7 +273,7 @@ clusterip_hashfn(const struct sk_buff *skb, } /* node numbers are 1..n, not 0..n */ - return (hashval % config->num_total_nodes) + 1; + return (((u64)hashval * config->num_total_nodes) >> 32) + 1; } static inline int @@ -289,12 +287,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) ***********************************************************************/ static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +clusterip_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; struct nf_conn *ct; @@ -361,11 +356,9 @@ target(struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +clusterip_tg_check(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { struct ipt_clusterip_tgt_info *cipinfo = targinfo; const struct ipt_entry *e = e_void; @@ -421,7 +414,7 @@ checkentry(const char *tablename, if (nf_ct_l3proto_try_module_get(target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%d\n", target->family); + "proto=%u\n", target->family); return false; } @@ -429,7 +422,7 @@ checkentry(const char *tablename, } /* drop reference count of cluster config when rule is deleted */ -static void destroy(const struct xt_target *target, void *targinfo) +static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) { struct ipt_clusterip_tgt_info *cipinfo = targinfo; @@ -456,12 +449,12 @@ struct compat_ipt_clusterip_tgt_info }; #endif /* CONFIG_COMPAT */ -static struct xt_target clusterip_tgt __read_mostly = { +static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", .family = AF_INET, - .target = target, - .checkentry = checkentry, - .destroy = destroy, + .target = clusterip_tg, + .checkentry = clusterip_tg_check, + .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), @@ -558,7 +551,7 @@ arp_mangle(unsigned int hook, return NF_ACCEPT; } -static struct nf_hook_ops cip_arp_ops = { +static struct nf_hook_ops cip_arp_ops __read_mostly = { .hook = arp_mangle, .pf = NF_ARP, .hooknum = NF_ARP_OUT, @@ -714,11 +707,11 @@ static const struct file_operations clusterip_proc_fops = { #endif /* CONFIG_PROC_FS */ -static int __init ipt_clusterip_init(void) +static int __init clusterip_tg_init(void) { int ret; - ret = xt_register_target(&clusterip_tgt); + ret = xt_register_target(&clusterip_tg_reg); if (ret < 0) return ret; @@ -744,11 +737,11 @@ cleanup_hook: nf_unregister_hook(&cip_arp_ops); #endif /* CONFIG_PROC_FS */ cleanup_target: - xt_unregister_target(&clusterip_tgt); + xt_unregister_target(&clusterip_tg_reg); return ret; } -static void __exit ipt_clusterip_fini(void) +static void __exit clusterip_tg_exit(void) { printk(KERN_NOTICE "ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); @@ -756,8 +749,8 @@ static void __exit ipt_clusterip_fini(void) remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); #endif nf_unregister_hook(&cip_arp_ops); - xt_unregister_target(&clusterip_tgt); + xt_unregister_target(&clusterip_tg_reg); } -module_init(ipt_clusterip_init); -module_exit(ipt_clusterip_fini); +module_init(clusterip_tg_init); +module_exit(clusterip_tg_exit); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index add110060a22..21395bc2b27f 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -21,7 +21,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables ECN modification module"); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification"); /* set ECT codepoint from IP header. * return false if there was an error. */ @@ -38,7 +38,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo) oldtos = iph->tos; iph->tos &= ~IPT_ECN_IP_MASK; iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); - nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); + csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); } return true; } @@ -71,18 +71,15 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) if (einfo->operation & IPT_ECN_OP_SET_CWR) tcph->cwr = einfo->proto.tcp.cwr; - nf_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + inet_proto_csum_replace2(&tcph->check, skb, + oldval, ((__be16 *)tcph)[6], 0); return true; } static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +ecn_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_ECN_info *einfo = targinfo; @@ -99,11 +96,9 @@ target(struct sk_buff *skb, } static bool -checkentry(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +ecn_tg_check(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo; const struct ipt_entry *e = e_void; @@ -127,25 +122,25 @@ checkentry(const char *tablename, return true; } -static struct xt_target ipt_ecn_reg __read_mostly = { +static struct xt_target ecn_tg_reg __read_mostly = { .name = "ECN", .family = AF_INET, - .target = target, + .target = ecn_tg, .targetsize = sizeof(struct ipt_ECN_info), .table = "mangle", - .checkentry = checkentry, + .checkentry = ecn_tg_check, .me = THIS_MODULE, }; -static int __init ipt_ecn_init(void) +static int __init ecn_tg_init(void) { - return xt_register_target(&ipt_ecn_reg); + return xt_register_target(&ecn_tg_reg); } -static void __exit ipt_ecn_fini(void) +static void __exit ecn_tg_exit(void) { - xt_unregister_target(&ipt_ecn_reg); + xt_unregister_target(&ecn_tg_reg); } -module_init(ipt_ecn_init); -module_exit(ipt_ecn_fini); +module_init(ecn_tg_init); +module_exit(ecn_tg_exit); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 4b5e8216a4e7..b38d7850f506 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -22,10 +22,11 @@ #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_LOG.h> +#include <net/netfilter/nf_log.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables syslog logging module"); +MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); /* Use lock to serialize, so printks don't overlap */ static DEFINE_SPINLOCK(log_lock); @@ -337,7 +338,9 @@ static void dump_packet(const struct nf_loginfo *info, if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + printk("UID=%u GID=%u", + skb->sk->sk_socket->file->f_uid, + skb->sk->sk_socket->file->f_gid); read_unlock_bh(&skb->sk->sk_callback_lock); } @@ -418,12 +421,9 @@ ipt_log_packet(unsigned int pf, } static unsigned int -ipt_log_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +log_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_log_info *loginfo = targinfo; struct nf_loginfo li; @@ -437,11 +437,10 @@ ipt_log_target(struct sk_buff *skb, return XT_CONTINUE; } -static bool ipt_log_checkentry(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +log_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_log_info *loginfo = targinfo; @@ -457,37 +456,37 @@ static bool ipt_log_checkentry(const char *tablename, return true; } -static struct xt_target ipt_log_reg __read_mostly = { +static struct xt_target log_tg_reg __read_mostly = { .name = "LOG", .family = AF_INET, - .target = ipt_log_target, + .target = log_tg, .targetsize = sizeof(struct ipt_log_info), - .checkentry = ipt_log_checkentry, + .checkentry = log_tg_check, .me = THIS_MODULE, }; -static struct nf_logger ipt_log_logger ={ +static const struct nf_logger ipt_log_logger ={ .name = "ipt_LOG", .logfn = &ipt_log_packet, .me = THIS_MODULE, }; -static int __init ipt_log_init(void) +static int __init log_tg_init(void) { int ret; - ret = xt_register_target(&ipt_log_reg); + ret = xt_register_target(&log_tg_reg); if (ret < 0) return ret; nf_log_register(PF_INET, &ipt_log_logger); return 0; } -static void __exit ipt_log_fini(void) +static void __exit log_tg_exit(void) { nf_log_unregister(&ipt_log_logger); - xt_unregister_target(&ipt_log_reg); + xt_unregister_target(&log_tg_reg); } -module_init(ipt_log_init); -module_exit(ipt_log_fini); +module_init(log_tg_init); +module_exit(log_tg_exit); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 44b516e7cb79..d80fee8327e4 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -25,18 +25,16 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables MASQUERADE target module"); +MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); /* Lock protects masq region inside conntrack */ static DEFINE_RWLOCK(masq_lock); /* FIXME: Multiple targets. --RR */ static bool -masquerade_check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +masquerade_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct nf_nat_multi_range_compat *mr = targinfo; @@ -52,12 +50,9 @@ masquerade_check(const char *tablename, } static unsigned int -masquerade_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +masquerade_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct nf_conn *ct; struct nf_conn_nat *nat; @@ -67,7 +62,7 @@ masquerade_target(struct sk_buff *skb, const struct rtable *rt; __be32 newsrc; - NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING); + NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); nat = nfct_nat(ct); @@ -100,7 +95,7 @@ masquerade_target(struct sk_buff *skb, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); } static int @@ -166,22 +161,22 @@ static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; -static struct xt_target masquerade __read_mostly = { +static struct xt_target masquerade_tg_reg __read_mostly = { .name = "MASQUERADE", .family = AF_INET, - .target = masquerade_target, + .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = 1 << NF_IP_POST_ROUTING, - .checkentry = masquerade_check, + .hooks = 1 << NF_INET_POST_ROUTING, + .checkentry = masquerade_tg_check, .me = THIS_MODULE, }; -static int __init ipt_masquerade_init(void) +static int __init masquerade_tg_init(void) { int ret; - ret = xt_register_target(&masquerade); + ret = xt_register_target(&masquerade_tg_reg); if (ret == 0) { /* Register for device down reports */ @@ -193,12 +188,12 @@ static int __init ipt_masquerade_init(void) return ret; } -static void __exit ipt_masquerade_fini(void) +static void __exit masquerade_tg_exit(void) { - xt_unregister_target(&masquerade); + xt_unregister_target(&masquerade_tg_reg); unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); } -module_init(ipt_masquerade_init); -module_exit(ipt_masquerade_fini); +module_init(masquerade_tg_init); +module_exit(masquerade_tg_exit); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index f8699291e33d..6739abfd1521 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -20,14 +20,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); -MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target"); +MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); static bool -check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +netmap_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct nf_nat_multi_range_compat *mr = targinfo; @@ -43,12 +41,9 @@ check(const char *tablename, } static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +netmap_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -56,14 +51,14 @@ target(struct sk_buff *skb, const struct nf_nat_multi_range_compat *mr = targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_POST_ROUTING - || hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING + || hooknum == NF_INET_POST_ROUTING + || hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) + if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT) new_ip = ip_hdr(skb)->daddr & ~netmask; else new_ip = ip_hdr(skb)->saddr & ~netmask; @@ -75,30 +70,31 @@ target(struct sk_buff *skb, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum)); } -static struct xt_target target_module __read_mostly = { +static struct xt_target netmap_tg_reg __read_mostly = { .name = "NETMAP", .family = AF_INET, - .target = target, + .target = netmap_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | - (1 << NF_IP_LOCAL_OUT), - .checkentry = check, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = netmap_tg_check, .me = THIS_MODULE }; -static int __init ipt_netmap_init(void) +static int __init netmap_tg_init(void) { - return xt_register_target(&target_module); + return xt_register_target(&netmap_tg_reg); } -static void __exit ipt_netmap_fini(void) +static void __exit netmap_tg_exit(void) { - xt_unregister_target(&target_module); + xt_unregister_target(&netmap_tg_reg); } -module_init(ipt_netmap_init); -module_exit(ipt_netmap_fini); +module_init(netmap_tg_init); +module_exit(netmap_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index f7cf7d61a2d4..5c6292449d13 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -23,15 +23,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables REDIRECT target module"); +MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); /* FIXME: Take multiple ranges --RR */ static bool -redirect_check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +redirect_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct nf_nat_multi_range_compat *mr = targinfo; @@ -47,12 +45,9 @@ redirect_check(const char *tablename, } static unsigned int -redirect_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +redirect_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -60,14 +55,14 @@ redirect_target(struct sk_buff *skb, const struct nf_nat_multi_range_compat *mr = targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING + || hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); /* Local packets: make them go to loopback */ - if (hooknum == NF_IP_LOCAL_OUT) + if (hooknum == NF_INET_LOCAL_OUT) newdst = htonl(0x7F000001); else { struct in_device *indev; @@ -92,29 +87,29 @@ redirect_target(struct sk_buff *skb, mr->range[0].min, mr->range[0].max }); /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); + return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST); } -static struct xt_target redirect_reg __read_mostly = { +static struct xt_target redirect_tg_reg __read_mostly = { .name = "REDIRECT", .family = AF_INET, - .target = redirect_target, + .target = redirect_tg, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), - .checkentry = redirect_check, + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), + .checkentry = redirect_tg_check, .me = THIS_MODULE, }; -static int __init ipt_redirect_init(void) +static int __init redirect_tg_init(void) { - return xt_register_target(&redirect_reg); + return xt_register_target(&redirect_tg_reg); } -static void __exit ipt_redirect_fini(void) +static void __exit redirect_tg_exit(void) { - xt_unregister_target(&redirect_reg); + xt_unregister_target(&redirect_tg_reg); } -module_init(ipt_redirect_init); -module_exit(ipt_redirect_fini); +module_init(redirect_tg_init); +module_exit(redirect_tg_exit); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index dcf4d21d5116..22606e2baa16 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -29,17 +29,14 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables REJECT target module"); +MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4"); /* Send RST reply */ static void send_reset(struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - struct iphdr *niph; + struct iphdr *oiph, *niph; struct tcphdr _otcph, *oth, *tcph; - __be16 tmp_port; - __be32 tmp_addr; - int needs_ack; unsigned int addr_type; /* IP header checks: fragment. */ @@ -58,99 +55,73 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* Check checksum */ if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) return; + oiph = ip_hdr(oldskb); - /* We need a linear, writeable skb. We also need to expand - headroom in case hh_len of incoming interface < hh_len of - outgoing interface */ - nskb = skb_copy_expand(oldskb, LL_MAX_HEADER, skb_tailroom(oldskb), - GFP_ATOMIC); + nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + + LL_MAX_HEADER, GFP_ATOMIC); if (!nskb) return; - /* This packet will not be the same as the other: clear nf fields */ - nf_reset(nskb); - nskb->mark = 0; - skb_init_secmark(nskb); - - skb_shinfo(nskb)->gso_size = 0; - skb_shinfo(nskb)->gso_segs = 0; - skb_shinfo(nskb)->gso_type = 0; - - tcph = (struct tcphdr *)(skb_network_header(nskb) + ip_hdrlen(nskb)); - - /* Swap source and dest */ - niph = ip_hdr(nskb); - tmp_addr = niph->saddr; - niph->saddr = niph->daddr; - niph->daddr = tmp_addr; - tmp_port = tcph->source; - tcph->source = tcph->dest; - tcph->dest = tmp_port; - - /* Truncate to length (no data) */ - tcph->doff = sizeof(struct tcphdr)/4; - skb_trim(nskb, ip_hdrlen(nskb) + sizeof(struct tcphdr)); - niph->tot_len = htons(nskb->len); - - if (tcph->ack) { - needs_ack = 0; + skb_reserve(nskb, LL_MAX_HEADER); + + skb_reset_network_header(nskb); + niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + niph->version = 4; + niph->ihl = sizeof(struct iphdr) / 4; + niph->tos = 0; + niph->id = 0; + niph->frag_off = htons(IP_DF); + niph->protocol = IPPROTO_TCP; + niph->check = 0; + niph->saddr = oiph->daddr; + niph->daddr = oiph->saddr; + + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + memset(tcph, 0, sizeof(*tcph)); + tcph->source = oth->dest; + tcph->dest = oth->source; + tcph->doff = sizeof(struct tcphdr) / 4; + + if (oth->ack) tcph->seq = oth->ack_seq; - tcph->ack_seq = 0; - } else { - needs_ack = 1; + else { tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + oldskb->len - ip_hdrlen(oldskb) - (oth->doff << 2)); - tcph->seq = 0; + tcph->ack = 1; } - /* Reset flags */ - ((u_int8_t *)tcph)[13] = 0; - tcph->rst = 1; - tcph->ack = needs_ack; - - tcph->window = 0; - tcph->urg_ptr = 0; - - /* Adjust TCP checksum */ - tcph->check = 0; - tcph->check = tcp_v4_check(sizeof(struct tcphdr), - niph->saddr, niph->daddr, - csum_partial(tcph, - sizeof(struct tcphdr), 0)); - - /* Set DF, id = 0 */ - niph->frag_off = htons(IP_DF); - niph->id = 0; + tcph->rst = 1; + tcph->check = tcp_v4_check(sizeof(struct tcphdr), + niph->saddr, niph->daddr, + csum_partial(tcph, + sizeof(struct tcphdr), 0)); addr_type = RTN_UNSPEC; - if (hook != NF_IP_FORWARD + if (hook != NF_INET_FORWARD #ifdef CONFIG_BRIDGE_NETFILTER || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED) #endif ) addr_type = RTN_LOCAL; + /* ip_route_me_harder expects skb->dst to be set */ + dst_hold(oldskb->dst); + nskb->dst = oldskb->dst; + if (ip_route_me_harder(nskb, addr_type)) goto free_nskb; + niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); nskb->ip_summed = CHECKSUM_NONE; - /* Adjust IP TTL */ - niph->ttl = dst_metric(nskb->dst, RTAX_HOPLIMIT); - - /* Adjust IP checksum */ - niph->check = 0; - niph->check = ip_fast_csum(skb_network_header(nskb), niph->ihl); - /* "Never happens" */ if (nskb->len > dst_mtu(nskb->dst)) goto free_nskb; nf_ct_attach(nskb, oldskb); - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, - dst_output); + ip_local_out(nskb); return; free_nskb: @@ -162,20 +133,13 @@ static inline void send_unreach(struct sk_buff *skb_in, int code) icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); } -static unsigned int reject(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +reject_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { const struct ipt_reject_info *reject = targinfo; - /* Our naive response construction doesn't deal with IP - options, and probably shouldn't try. */ - if (ip_hdrlen(skb) != sizeof(struct iphdr)) - return NF_DROP; - /* WARNING: This code causes reentry within iptables. This means that the iptables jump stack is now crap. We must return an absolute verdict. --RR */ @@ -211,11 +175,10 @@ static unsigned int reject(struct sk_buff *skb, return NF_DROP; } -static bool check(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +reject_tg_check(const char *tablename, const void *e_void, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_reject_info *rejinfo = targinfo; const struct ipt_entry *e = e_void; @@ -234,27 +197,27 @@ static bool check(const char *tablename, return true; } -static struct xt_target ipt_reject_reg __read_mostly = { +static struct xt_target reject_tg_reg __read_mostly = { .name = "REJECT", .family = AF_INET, - .target = reject, + .target = reject_tg, .targetsize = sizeof(struct ipt_reject_info), .table = "filter", - .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT), - .checkentry = check, + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = reject_tg_check, .me = THIS_MODULE, }; -static int __init ipt_reject_init(void) +static int __init reject_tg_init(void) { - return xt_register_target(&ipt_reject_reg); + return xt_register_target(&reject_tg_reg); } -static void __exit ipt_reject_fini(void) +static void __exit reject_tg_exit(void) { - xt_unregister_target(&ipt_reject_reg); + xt_unregister_target(&reject_tg_reg); } -module_init(ipt_reject_init); -module_exit(ipt_reject_fini); +module_init(reject_tg_init); +module_exit(reject_tg_exit); diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c deleted file mode 100644 index 8988571436b8..000000000000 --- a/net/ipv4/netfilter/ipt_SAME.c +++ /dev/null @@ -1,179 +0,0 @@ -/* Same. Just like SNAT, only try to make the connections - * between client A and server B always have the same source ip. - * - * (C) 2000 Paul `Rusty' Russell - * (C) 2001 Martin Josefsson - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/types.h> -#include <linux/ip.h> -#include <linux/timer.h> -#include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/netdevice.h> -#include <linux/if.h> -#include <linux/inetdevice.h> -#include <net/protocol.h> -#include <net/checksum.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_nat_rule.h> -#include <linux/netfilter_ipv4/ipt_SAME.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>"); -MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip"); - -static bool -same_check(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - unsigned int count, countess, rangeip, index = 0; - struct ipt_same_info *mr = targinfo; - - mr->ipnum = 0; - - if (mr->rangesize < 1) { - pr_debug("same_check: need at least one dest range.\n"); - return false; - } - if (mr->rangesize > IPT_SAME_MAX_RANGE) { - pr_debug("same_check: too many ranges specified, maximum " - "is %u ranges\n", IPT_SAME_MAX_RANGE); - return false; - } - for (count = 0; count < mr->rangesize; count++) { - if (ntohl(mr->range[count].min_ip) > - ntohl(mr->range[count].max_ip)) { - pr_debug("same_check: min_ip is larger than max_ip in " - "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n", - NIPQUAD(mr->range[count].min_ip), - NIPQUAD(mr->range[count].max_ip)); - return false; - } - if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) { - pr_debug("same_check: bad MAP_IPS.\n"); - return false; - } - rangeip = (ntohl(mr->range[count].max_ip) - - ntohl(mr->range[count].min_ip) + 1); - mr->ipnum += rangeip; - - pr_debug("same_check: range %u, ipnum = %u\n", count, rangeip); - } - pr_debug("same_check: total ipaddresses = %u\n", mr->ipnum); - - mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL); - if (!mr->iparray) { - pr_debug("same_check: Couldn't allocate %Zu bytes " - "for %u ipaddresses!\n", - (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); - return false; - } - pr_debug("same_check: Allocated %Zu bytes for %u ipaddresses.\n", - (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); - - for (count = 0; count < mr->rangesize; count++) { - for (countess = ntohl(mr->range[count].min_ip); - countess <= ntohl(mr->range[count].max_ip); - countess++) { - mr->iparray[index] = countess; - pr_debug("same_check: Added ipaddress `%u.%u.%u.%u' " - "in index %u.\n", HIPQUAD(countess), index); - index++; - } - } - return true; -} - -static void -same_destroy(const struct xt_target *target, void *targinfo) -{ - struct ipt_same_info *mr = targinfo; - - kfree(mr->iparray); - - pr_debug("same_destroy: Deallocated %Zu bytes for %u ipaddresses.\n", - (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); -} - -static unsigned int -same_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - u_int32_t tmpip, aindex; - __be32 new_ip; - const struct ipt_same_info *same = targinfo; - struct nf_nat_range newrange; - const struct nf_conntrack_tuple *t; - - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || - hooknum == NF_IP_POST_ROUTING); - ct = nf_ct_get(skb, &ctinfo); - - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - - /* Base new source on real src ip and optionally dst ip, - giving some hope for consistency across reboots. - Here we calculate the index in same->iparray which - holds the ipaddress we should use */ - - tmpip = ntohl(t->src.u3.ip); - - if (!(same->info & IPT_SAME_NODST)) - tmpip += ntohl(t->dst.u3.ip); - aindex = tmpip % same->ipnum; - - new_ip = htonl(same->iparray[aindex]); - - pr_debug("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, " - "new src=%u.%u.%u.%u\n", - NIPQUAD(t->src.u3.ip), NIPQUAD(t->dst.u3.ip), NIPQUAD(new_ip)); - - /* Transfer from original range. */ - newrange = ((struct nf_nat_range) - { same->range[0].flags, new_ip, new_ip, - /* FIXME: Use ports from correct range! */ - same->range[0].min, same->range[0].max }); - - /* Hand modified range to generic setup. */ - return nf_nat_setup_info(ct, &newrange, hooknum); -} - -static struct xt_target same_reg __read_mostly = { - .name = "SAME", - .family = AF_INET, - .target = same_target, - .targetsize = sizeof(struct ipt_same_info), - .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING), - .checkentry = same_check, - .destroy = same_destroy, - .me = THIS_MODULE, -}; - -static int __init ipt_same_init(void) -{ - return xt_register_target(&same_reg); -} - -static void __exit ipt_same_fini(void) -{ - xt_unregister_target(&same_reg); -} - -module_init(ipt_same_init); -module_exit(ipt_same_fini); - diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c deleted file mode 100644 index d4573baa7f27..000000000000 --- a/net/ipv4/netfilter/ipt_TOS.c +++ /dev/null @@ -1,87 +0,0 @@ -/* This is a module which is used for setting the TOS field of a packet. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/checksum.h> - -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_TOS.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("iptables TOS mangling module"); - -static unsigned int -target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) -{ - const struct ipt_tos_target_info *tosinfo = targinfo; - struct iphdr *iph = ip_hdr(skb); - - if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { - __u8 oldtos; - if (!skb_make_writable(skb, sizeof(struct iphdr))) - return NF_DROP; - iph = ip_hdr(skb); - oldtos = iph->tos; - iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos; - nf_csum_replace2(&iph->check, htons(oldtos), htons(iph->tos)); - } - return XT_CONTINUE; -} - -static bool -checkentry(const char *tablename, - const void *e_void, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) -{ - const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; - - if (tos != IPTOS_LOWDELAY - && tos != IPTOS_THROUGHPUT - && tos != IPTOS_RELIABILITY - && tos != IPTOS_MINCOST - && tos != IPTOS_NORMALSVC) { - printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); - return false; - } - return true; -} - -static struct xt_target ipt_tos_reg __read_mostly = { - .name = "TOS", - .family = AF_INET, - .target = target, - .targetsize = sizeof(struct ipt_tos_target_info), - .table = "mangle", - .checkentry = checkentry, - .me = THIS_MODULE, -}; - -static int __init ipt_tos_init(void) -{ - return xt_register_target(&ipt_tos_reg); -} - -static void __exit ipt_tos_fini(void) -{ - xt_unregister_target(&ipt_tos_reg); -} - -module_init(ipt_tos_init); -module_exit(ipt_tos_fini); diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index c620a0527666..30eed65e7338 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -16,14 +16,13 @@ #include <linux/netfilter_ipv4/ipt_TTL.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("IP tables TTL modification module"); +MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target"); MODULE_LICENSE("GPL"); static unsigned int -ipt_ttl_target(struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - unsigned int hooknum, const struct xt_target *target, - const void *targinfo) +ttl_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct iphdr *iph; const struct ipt_TTL_info *info = targinfo; @@ -54,19 +53,18 @@ ipt_ttl_target(struct sk_buff *skb, } if (new_ttl != iph->ttl) { - nf_csum_replace2(&iph->check, htons(iph->ttl << 8), - htons(new_ttl << 8)); + csum_replace2(&iph->check, htons(iph->ttl << 8), + htons(new_ttl << 8)); iph->ttl = new_ttl; } return XT_CONTINUE; } -static bool ipt_ttl_checkentry(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool +ttl_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hook_mask) { const struct ipt_TTL_info *info = targinfo; @@ -80,25 +78,25 @@ static bool ipt_ttl_checkentry(const char *tablename, return true; } -static struct xt_target ipt_TTL __read_mostly = { +static struct xt_target ttl_tg_reg __read_mostly = { .name = "TTL", .family = AF_INET, - .target = ipt_ttl_target, + .target = ttl_tg, .targetsize = sizeof(struct ipt_TTL_info), .table = "mangle", - .checkentry = ipt_ttl_checkentry, + .checkentry = ttl_tg_check, .me = THIS_MODULE, }; -static int __init ipt_ttl_init(void) +static int __init ttl_tg_init(void) { - return xt_register_target(&ipt_TTL); + return xt_register_target(&ttl_tg_reg); } -static void __exit ipt_ttl_fini(void) +static void __exit ttl_tg_exit(void) { - xt_unregister_target(&ipt_TTL); + xt_unregister_target(&ttl_tg_reg); } -module_init(ipt_ttl_init); -module_exit(ipt_ttl_fini); +module_init(ttl_tg_init); +module_exit(ttl_tg_exit); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 212b830765a4..b192756c6d0d 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -43,13 +43,14 @@ #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_ULOG.h> +#include <net/netfilter/nf_log.h> #include <net/sock.h> #include <linux/bitops.h> #include <asm/unaligned.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); -MODULE_DESCRIPTION("iptables userspace logging module"); +MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define ULOG_NL_EVENT 111 /* Harald's favorite number */ @@ -279,12 +280,10 @@ alloc_failure: spin_unlock_bh(&ulog_lock); } -static unsigned int ipt_ulog_target(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +static unsigned int +ulog_tg(struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, const void *targinfo) { struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; @@ -318,11 +317,10 @@ static void ipt_logfn(unsigned int pf, ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } -static bool ipt_ulog_checkentry(const char *tablename, - const void *e, - const struct xt_target *target, - void *targinfo, - unsigned int hookmask) +static bool +ulog_tg_check(const char *tablename, const void *e, + const struct xt_target *target, void *targinfo, + unsigned int hookmask) { const struct ipt_ulog_info *loginfo = targinfo; @@ -347,7 +345,7 @@ struct compat_ipt_ulog_info { char prefix[ULOG_PREFIX_LEN]; }; -static void compat_from_user(void *dst, void *src) +static void ulog_tg_compat_from_user(void *dst, void *src) { const struct compat_ipt_ulog_info *cl = src; struct ipt_ulog_info l = { @@ -360,7 +358,7 @@ static void compat_from_user(void *dst, void *src) memcpy(dst, &l, sizeof(l)); } -static int compat_to_user(void __user *dst, void *src) +static int ulog_tg_compat_to_user(void __user *dst, void *src) { const struct ipt_ulog_info *l = src; struct compat_ipt_ulog_info cl = { @@ -374,16 +372,16 @@ static int compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_target ipt_ulog_reg __read_mostly = { +static struct xt_target ulog_tg_reg __read_mostly = { .name = "ULOG", .family = AF_INET, - .target = ipt_ulog_target, + .target = ulog_tg, .targetsize = sizeof(struct ipt_ulog_info), - .checkentry = ipt_ulog_checkentry, + .checkentry = ulog_tg_check, #ifdef CONFIG_COMPAT .compatsize = sizeof(struct compat_ipt_ulog_info), - .compat_from_user = compat_from_user, - .compat_to_user = compat_to_user, + .compat_from_user = ulog_tg_compat_from_user, + .compat_to_user = ulog_tg_compat_to_user, #endif .me = THIS_MODULE, }; @@ -394,7 +392,7 @@ static struct nf_logger ipt_ulog_logger = { .me = THIS_MODULE, }; -static int __init ipt_ulog_init(void) +static int __init ulog_tg_init(void) { int ret, i; @@ -415,9 +413,9 @@ static int __init ipt_ulog_init(void) if (!nflognl) return -ENOMEM; - ret = xt_register_target(&ipt_ulog_reg); + ret = xt_register_target(&ulog_tg_reg); if (ret < 0) { - sock_release(nflognl->sk_socket); + netlink_kernel_release(nflognl); return ret; } if (nflog) @@ -426,7 +424,7 @@ static int __init ipt_ulog_init(void) return 0; } -static void __exit ipt_ulog_fini(void) +static void __exit ulog_tg_exit(void) { ulog_buff_t *ub; int i; @@ -435,8 +433,8 @@ static void __exit ipt_ulog_fini(void) if (nflog) nf_log_unregister(&ipt_ulog_logger); - xt_unregister_target(&ipt_ulog_reg); - sock_release(nflognl->sk_socket); + xt_unregister_target(&ulog_tg_reg); + netlink_kernel_release(nflognl); /* remove pending timers and free allocated skb's */ for (i = 0; i < ULOG_MAXNLGROUPS; i++) { @@ -453,5 +451,5 @@ static void __exit ipt_ulog_fini(void) } } -module_init(ipt_ulog_init); -module_exit(ipt_ulog_fini); +module_init(ulog_tg_init); +module_exit(ulog_tg_exit); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index 59f01f7ba6b4..49587a497229 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -2,6 +2,7 @@ * iptables module to match inet_addr_type() of an ip. * * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> + * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -20,47 +21,119 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("iptables addrtype match"); +MODULE_DESCRIPTION("Xtables: address type match for IPv4"); -static inline bool match_type(__be32 addr, u_int16_t mask) +static inline bool match_type(const struct net_device *dev, __be32 addr, + u_int16_t mask) { - return !!(mask & (1 << inet_addr_type(addr))); + return !!(mask & (1 << inet_dev_addr_type(&init_net, dev, addr))); } -static bool match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct ipt_addrtype_info *info = matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; if (info->source) - ret &= match_type(iph->saddr, info->source)^info->invert_source; + ret &= match_type(NULL, iph->saddr, info->source) ^ + info->invert_source; if (info->dest) - ret &= match_type(iph->daddr, info->dest)^info->invert_dest; + ret &= match_type(NULL, iph->daddr, info->dest) ^ + info->invert_dest; return ret; } -static struct xt_match addrtype_match __read_mostly = { - .name = "addrtype", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_addrtype_info), - .me = THIS_MODULE +static bool +addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct ipt_addrtype_info_v1 *info = matchinfo; + const struct iphdr *iph = ip_hdr(skb); + const struct net_device *dev = NULL; + bool ret = true; + + if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) + dev = in; + else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) + dev = out; + + if (info->source) + ret &= match_type(dev, iph->saddr, info->source) ^ + (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type(dev, iph->daddr, info->dest) ^ + (info->flags & IPT_ADDRTYPE_INVERT_DEST); + return ret; +} + +static bool +addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) +{ + struct ipt_addrtype_info_v1 *info = matchinfo; + + if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { + printk(KERN_ERR "ipt_addrtype: both incoming and outgoing " + "interface limitation cannot be selected\n"); + return false; + } + + if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) && + info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { + printk(KERN_ERR "ipt_addrtype: output interface limitation " + "not valid in PRE_ROUTING and INPUT\n"); + return false; + } + + if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) && + info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { + printk(KERN_ERR "ipt_addrtype: input interface limitation " + "not valid in POST_ROUTING and OUTPUT\n"); + return false; + } + + return true; +} + +static struct xt_match addrtype_mt_reg[] __read_mostly = { + { + .name = "addrtype", + .family = AF_INET, + .match = addrtype_mt_v0, + .matchsize = sizeof(struct ipt_addrtype_info), + .me = THIS_MODULE + }, + { + .name = "addrtype", + .family = AF_INET, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct ipt_addrtype_info_v1), + .me = THIS_MODULE + } }; -static int __init ipt_addrtype_init(void) +static int __init addrtype_mt_init(void) { - return xt_register_match(&addrtype_match); + return xt_register_matches(addrtype_mt_reg, + ARRAY_SIZE(addrtype_mt_reg)); } -static void __exit ipt_addrtype_fini(void) +static void __exit addrtype_mt_exit(void) { - xt_unregister_match(&addrtype_match); + xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); } -module_init(ipt_addrtype_init); -module_exit(ipt_addrtype_fini); +module_init(addrtype_mt_init); +module_exit(addrtype_mt_exit); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index 61b017fd743c..e977989629c7 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -16,7 +16,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); -MODULE_DESCRIPTION("iptables AH SPI match module"); +MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match"); #ifdef DEBUG_CONNTRACK #define duprintf(format, args...) printk(format , ## args) @@ -37,14 +37,9 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) } static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +ah_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; @@ -72,11 +67,9 @@ match(const struct sk_buff *skb, /* Called when user tries to insert an entry of this type. */ static bool -checkentry(const char *tablename, - const void *ip_void, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +ah_mt_check(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_ah *ahinfo = matchinfo; @@ -88,25 +81,25 @@ checkentry(const char *tablename, return true; } -static struct xt_match ah_match __read_mostly = { +static struct xt_match ah_mt_reg __read_mostly = { .name = "ah", .family = AF_INET, - .match = match, + .match = ah_mt, .matchsize = sizeof(struct ipt_ah), .proto = IPPROTO_AH, - .checkentry = checkentry, + .checkentry = ah_mt_check, .me = THIS_MODULE, }; -static int __init ipt_ah_init(void) +static int __init ah_mt_init(void) { - return xt_register_match(&ah_match); + return xt_register_match(&ah_mt_reg); } -static void __exit ipt_ah_fini(void) +static void __exit ah_mt_exit(void) { - xt_unregister_match(&ah_match); + xt_unregister_match(&ah_mt_reg); } -module_init(ipt_ah_init); -module_exit(ipt_ah_fini); +module_init(ah_mt_init); +module_exit(ah_mt_exit); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index d6925c674069..749de8284ce5 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -19,7 +19,7 @@ #include <linux/netfilter_ipv4/ipt_ecn.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("iptables ECN matching module"); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match for IPv4"); MODULE_LICENSE("GPL"); static inline bool match_ip(const struct sk_buff *skb, @@ -67,10 +67,10 @@ static inline bool match_tcp(const struct sk_buff *skb, return true; } -static bool match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +ecn_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct ipt_ecn_info *info = matchinfo; @@ -88,9 +88,10 @@ static bool match(const struct sk_buff *skb, return true; } -static bool checkentry(const char *tablename, const void *ip_void, - const struct xt_match *match, - void *matchinfo, unsigned int hook_mask) +static bool +ecn_mt_check(const char *tablename, const void *ip_void, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_ecn_info *info = matchinfo; const struct ipt_ip *ip = ip_void; @@ -111,24 +112,24 @@ static bool checkentry(const char *tablename, const void *ip_void, return true; } -static struct xt_match ecn_match __read_mostly = { +static struct xt_match ecn_mt_reg __read_mostly = { .name = "ecn", .family = AF_INET, - .match = match, + .match = ecn_mt, .matchsize = sizeof(struct ipt_ecn_info), - .checkentry = checkentry, + .checkentry = ecn_mt_check, .me = THIS_MODULE, }; -static int __init ipt_ecn_init(void) +static int __init ecn_mt_init(void) { - return xt_register_match(&ecn_match); + return xt_register_match(&ecn_mt_reg); } -static void __exit ipt_ecn_fini(void) +static void __exit ecn_mt_exit(void) { - xt_unregister_match(&ecn_match); + xt_unregister_match(&ecn_mt_reg); } -module_init(ipt_ecn_init); -module_exit(ipt_ecn_fini); +module_init(ecn_mt_init); +module_exit(ecn_mt_exit); diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c deleted file mode 100644 index 0106dc955a69..000000000000 --- a/net/ipv4/netfilter/ipt_iprange.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * iptables module to match IP address ranges - * - * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/netfilter/x_tables.h> -#include <linux/netfilter_ipv4/ipt_iprange.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); -MODULE_DESCRIPTION("iptables arbitrary IP range match module"); - -static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) -{ - const struct ipt_iprange_info *info = matchinfo; - const struct iphdr *iph = ip_hdr(skb); - - if (info->flags & IPRANGE_SRC) { - if ((ntohl(iph->saddr) < ntohl(info->src.min_ip) - || ntohl(iph->saddr) > ntohl(info->src.max_ip)) - ^ !!(info->flags & IPRANGE_SRC_INV)) { - pr_debug("src IP %u.%u.%u.%u NOT in range %s" - "%u.%u.%u.%u-%u.%u.%u.%u\n", - NIPQUAD(iph->saddr), - info->flags & IPRANGE_SRC_INV ? "(INV) " : "", - NIPQUAD(info->src.min_ip), - NIPQUAD(info->src.max_ip)); - return false; - } - } - if (info->flags & IPRANGE_DST) { - if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip) - || ntohl(iph->daddr) > ntohl(info->dst.max_ip)) - ^ !!(info->flags & IPRANGE_DST_INV)) { - pr_debug("dst IP %u.%u.%u.%u NOT in range %s" - "%u.%u.%u.%u-%u.%u.%u.%u\n", - NIPQUAD(iph->daddr), - info->flags & IPRANGE_DST_INV ? "(INV) " : "", - NIPQUAD(info->dst.min_ip), - NIPQUAD(info->dst.max_ip)); - return false; - } - } - return true; -} - -static struct xt_match iprange_match __read_mostly = { - .name = "iprange", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_iprange_info), - .me = THIS_MODULE -}; - -static int __init ipt_iprange_init(void) -{ - return xt_register_match(&iprange_match); -} - -static void __exit ipt_iprange_fini(void) -{ - xt_unregister_match(&iprange_match); -} - -module_init(ipt_iprange_init); -module_exit(ipt_iprange_fini); diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c deleted file mode 100644 index b14e77da7a33..000000000000 --- a/net/ipv4/netfilter/ipt_owner.c +++ /dev/null @@ -1,92 +0,0 @@ -/* Kernel module to match various things tied to sockets associated with - locally generated outgoing packets. */ - -/* (C) 2000 Marc Boucher <marc@mbsi.ca> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/file.h> -#include <linux/rcupdate.h> -#include <net/sock.h> - -#include <linux/netfilter_ipv4/ipt_owner.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); -MODULE_DESCRIPTION("iptables owner match"); - -static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) -{ - const struct ipt_owner_info *info = matchinfo; - - if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file) - return false; - - if(info->match & IPT_OWNER_UID) { - if ((skb->sk->sk_socket->file->f_uid != info->uid) ^ - !!(info->invert & IPT_OWNER_UID)) - return false; - } - - if(info->match & IPT_OWNER_GID) { - if ((skb->sk->sk_socket->file->f_gid != info->gid) ^ - !!(info->invert & IPT_OWNER_GID)) - return false; - } - - return true; -} - -static bool -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) -{ - const struct ipt_owner_info *info = matchinfo; - - if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { - printk("ipt_owner: pid, sid and command matching " - "not supported anymore\n"); - return false; - } - return true; -} - -static struct xt_match owner_match __read_mostly = { - .name = "owner", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_owner_info), - .hooks = (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING), - .checkentry = checkentry, - .me = THIS_MODULE, -}; - -static int __init ipt_owner_init(void) -{ - return xt_register_match(&owner_match); -} - -static void __exit ipt_owner_fini(void) -{ - xt_unregister_match(&owner_match); -} - -module_init(ipt_owner_init); -module_exit(ipt_owner_fini); diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 11d39fb5f38b..e3154a99c08a 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -30,7 +30,7 @@ #include <linux/netfilter_ipv4/ipt_recent.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("IP tables recently seen matching module"); +MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); MODULE_LICENSE("GPL"); static unsigned int ip_list_tot = 100; @@ -170,10 +170,10 @@ static void recent_table_flush(struct recent_table *t) } static bool -ipt_recent_match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +recent_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; @@ -236,9 +236,9 @@ out: } static bool -ipt_recent_checkentry(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +recent_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; @@ -293,8 +293,7 @@ out: return ret; } -static void -ipt_recent_destroy(const struct xt_match *match, void *matchinfo) +static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) { const struct ipt_recent_info *info = matchinfo; struct recent_table *t; @@ -455,17 +454,17 @@ static const struct file_operations recent_fops = { }; #endif /* CONFIG_PROC_FS */ -static struct xt_match recent_match __read_mostly = { +static struct xt_match recent_mt_reg __read_mostly = { .name = "recent", .family = AF_INET, - .match = ipt_recent_match, + .match = recent_mt, .matchsize = sizeof(struct ipt_recent_info), - .checkentry = ipt_recent_checkentry, - .destroy = ipt_recent_destroy, + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, .me = THIS_MODULE, }; -static int __init ipt_recent_init(void) +static int __init recent_mt_init(void) { int err; @@ -473,27 +472,27 @@ static int __init ipt_recent_init(void) return -EINVAL; ip_list_hash_size = 1 << fls(ip_list_tot); - err = xt_register_match(&recent_match); + err = xt_register_match(&recent_mt_reg); #ifdef CONFIG_PROC_FS if (err) return err; proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); if (proc_dir == NULL) { - xt_unregister_match(&recent_match); + xt_unregister_match(&recent_mt_reg); err = -ENOMEM; } #endif return err; } -static void __exit ipt_recent_exit(void) +static void __exit recent_mt_exit(void) { BUG_ON(!list_empty(&tables)); - xt_unregister_match(&recent_match); + xt_unregister_match(&recent_mt_reg); #ifdef CONFIG_PROC_FS remove_proc_entry("ipt_recent", init_net.proc_net); #endif } -module_init(ipt_recent_init); -module_exit(ipt_recent_exit); +module_init(recent_mt_init); +module_exit(recent_mt_exit); diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c deleted file mode 100644 index e740441c973d..000000000000 --- a/net/ipv4/netfilter/ipt_tos.c +++ /dev/null @@ -1,55 +0,0 @@ -/* Kernel module to match TOS values. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/skbuff.h> - -#include <linux/netfilter_ipv4/ipt_tos.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("iptables TOS match module"); - -static bool -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) -{ - const struct ipt_tos_info *info = matchinfo; - - return (ip_hdr(skb)->tos == info->tos) ^ info->invert; -} - -static struct xt_match tos_match __read_mostly = { - .name = "tos", - .family = AF_INET, - .match = match, - .matchsize = sizeof(struct ipt_tos_info), - .me = THIS_MODULE, -}; - -static int __init ipt_multiport_init(void) -{ - return xt_register_match(&tos_match); -} - -static void __exit ipt_multiport_fini(void) -{ - xt_unregister_match(&tos_match); -} - -module_init(ipt_multiport_init); -module_exit(ipt_multiport_fini); diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index a439900a4ba5..e0b8caeb710c 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -15,13 +15,13 @@ #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); -MODULE_DESCRIPTION("IP tables TTL matching module"); +MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); MODULE_LICENSE("GPL"); -static bool match(const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +ttl_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) { const struct ipt_ttl_info *info = matchinfo; const u8 ttl = ip_hdr(skb)->ttl; @@ -44,23 +44,23 @@ static bool match(const struct sk_buff *skb, return false; } -static struct xt_match ttl_match __read_mostly = { +static struct xt_match ttl_mt_reg __read_mostly = { .name = "ttl", .family = AF_INET, - .match = match, + .match = ttl_mt, .matchsize = sizeof(struct ipt_ttl_info), .me = THIS_MODULE, }; -static int __init ipt_ttl_init(void) +static int __init ttl_mt_init(void) { - return xt_register_match(&ttl_match); + return xt_register_match(&ttl_mt_reg); } -static void __exit ipt_ttl_fini(void) +static void __exit ttl_mt_exit(void) { - xt_unregister_match(&ttl_match); + xt_unregister_match(&ttl_mt_reg); } -module_init(ipt_ttl_init); -module_exit(ipt_ttl_fini); +module_init(ttl_mt_init); +module_exit(ttl_mt_exit); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index ba3262c60437..29bb4f9fbda0 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -19,7 +19,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables filter table"); -#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) +#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT)) static struct { @@ -33,14 +35,14 @@ static struct .num_entries = 4, .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_LOCAL_IN] = 0, - [NF_IP_FORWARD] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, }, .underflow = { - [NF_IP_LOCAL_IN] = 0, - [NF_IP_FORWARD] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, }, }, .entries = { @@ -89,26 +91,26 @@ ipt_local_out_hook(unsigned int hook, return ipt_do_table(skb, hook, in, out, &packet_filter); } -static struct nf_hook_ops ipt_ops[] = { +static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_FILTER, }, { .hook = ipt_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_FORWARD, + .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_FILTER, }, { .hook = ipt_local_out_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_FILTER, }, }; diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index b4360a69d5ca..5c4be202430c 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -21,11 +21,11 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mangle table"); -#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \ - (1 << NF_IP_LOCAL_IN) | \ - (1 << NF_IP_FORWARD) | \ - (1 << NF_IP_LOCAL_OUT) | \ - (1 << NF_IP_POST_ROUTING)) +#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_POST_ROUTING)) /* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ static struct @@ -40,18 +40,18 @@ static struct .num_entries = 6, .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4, + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, }, .underflow = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4, + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, }, }, .entries = { @@ -128,40 +128,40 @@ ipt_local_hook(unsigned int hook, return ret; } -static struct nf_hook_ops ipt_ops[] = { +static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_FORWARD, + .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_local_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_MANGLE, }, { .hook = ipt_route_hook, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_MANGLE, }, }; diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index f8678651250f..dc34aa274533 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -7,7 +7,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <net/ip.h> -#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) +#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) static struct { @@ -21,12 +21,12 @@ static struct .num_entries = 3, .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) }, .underflow = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) }, }, .entries = { @@ -74,18 +74,18 @@ ipt_local_hook(unsigned int hook, } /* 'raw' is the very first table. */ -static struct nf_hook_ops ipt_ops[] = { +static struct nf_hook_ops ipt_ops[] __read_mostly = { { .hook = ipt_hook, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_RAW, .owner = THIS_MODULE, }, { .hook = ipt_local_hook, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_RAW, .owner = THIS_MODULE, }, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 910dae732a0f..ac3d61d8026e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -56,12 +56,6 @@ static int ipv4_print_tuple(struct seq_file *s, NIPQUAD(tuple->dst.u3.ip)); } -static int ipv4_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) { @@ -150,7 +144,7 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, /* Gather fragments. */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (nf_ct_ipv4_gather_frags(skb, - hooknum == NF_IP_PRE_ROUTING ? + hooknum == NF_INET_PRE_ROUTING ? IP_DEFRAG_CONNTRACK_IN : IP_DEFRAG_CONNTRACK_OUT)) return NF_STOLEN; @@ -185,61 +179,61 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, /* Connection tracking may drop packets, but never alters them, so make it the first hook. */ -static struct nf_hook_ops ipv4_conntrack_ops[] = { +static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_in, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_defrag, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_local, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK, }, { .hook = ipv4_conntrack_help, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_conntrack_help, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_HELPER, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { .hook = ipv4_confirm, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, }; @@ -363,10 +357,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NLA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), - &tuple->src.u3.ip); - NLA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), - &tuple->dst.u3.ip); + NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip); + NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip); return 0; nla_put_failure: @@ -384,8 +376,8 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[], if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; - t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_DST]); + t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]); return 0; } @@ -405,7 +397,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .print_tuple = ipv4_print_tuple, - .print_conntrack = ipv4_print_conntrack, .get_l4proto = ipv4_get_l4proto, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = ipv4_tuple_to_nlattr, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 741f3dfaa5a1..543c02b74c96 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -121,10 +121,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) return -ENOSPC; - if (l3proto->print_conntrack(s, ct)) - return -ENOSPC; - - if (l4proto->print_conntrack(s, ct)) + if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) return -ENOSPC; if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index adcbaf6d4299..4004a04c5510 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -18,6 +18,7 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_log.h> static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; @@ -73,13 +74,6 @@ static int icmp_print_tuple(struct seq_file *s, ntohs(tuple->src.u.icmp.id)); } -/* Print out the private part of the conntrack. */ -static int icmp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) -{ - return 0; -} - /* Returns verdict for packet, or -1 for invalid. */ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, @@ -128,7 +122,6 @@ static int icmp_new(struct nf_conn *conntrack, return 1; } -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int icmp_error_message(struct sk_buff *skb, @@ -195,7 +188,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, } /* See ip_conntrack_proto_tcp.c */ - if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, @@ -235,12 +228,9 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NLA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), - &t->src.u.icmp.id); - NLA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), - &t->dst.u.icmp.type); - NLA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), - &t->dst.u.icmp.code); + NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id); + NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type); + NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code); return 0; @@ -262,12 +252,9 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], || !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; - tuple->dst.u.icmp.type = - *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = - *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = - *(__be16 *)nla_data(tb[CTA_PROTO_ICMP_ID]); + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); if (tuple->dst.u.icmp.type >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type]) @@ -315,7 +302,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .print_tuple = icmp_print_tuple, - .print_conntrack = icmp_print_conntrack, .packet = icmp_packet, .new = icmp_new, .error = icmp_error, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 86b465b176ba..e53ae1ef8f5e 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -33,27 +33,28 @@ static DEFINE_RWLOCK(nf_nat_lock); -static struct nf_conntrack_l3proto *l3proto = NULL; +static struct nf_conntrack_l3proto *l3proto __read_mostly; /* Calculated at init based on memory size */ -static unsigned int nf_nat_htable_size; +static unsigned int nf_nat_htable_size __read_mostly; static int nf_nat_vmalloced; -static struct hlist_head *bysource; +static struct hlist_head *bysource __read_mostly; #define MAX_IP_NAT_PROTO 256 -static struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]; +static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] + __read_mostly; -static inline struct nf_nat_protocol * +static inline const struct nf_nat_protocol * __nf_nat_proto_find(u_int8_t protonum) { return rcu_dereference(nf_nat_protos[protonum]); } -struct nf_nat_protocol * +const struct nf_nat_protocol * nf_nat_proto_find_get(u_int8_t protonum) { - struct nf_nat_protocol *p; + const struct nf_nat_protocol *p; rcu_read_lock(); p = __nf_nat_proto_find(protonum); @@ -66,7 +67,7 @@ nf_nat_proto_find_get(u_int8_t protonum) EXPORT_SYMBOL_GPL(nf_nat_proto_find_get); void -nf_nat_proto_put(struct nf_nat_protocol *p) +nf_nat_proto_put(const struct nf_nat_protocol *p) { module_put(p->me); } @@ -76,10 +77,13 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put); static inline unsigned int hash_by_src(const struct nf_conntrack_tuple *tuple) { + unsigned int hash; + /* Original src, to ensure we map it consistently if poss. */ - return jhash_3words((__force u32)tuple->src.u3.ip, + hash = jhash_3words((__force u32)tuple->src.u3.ip, (__force u32)tuple->src.u.all, - tuple->dst.protonum, 0) % nf_nat_htable_size; + tuple->dst.protonum, 0); + return ((u64)hash * nf_nat_htable_size) >> 32; } /* Is this tuple already taken? (not by us) */ @@ -105,7 +109,7 @@ static int in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range) { - struct nf_nat_protocol *proto; + const struct nf_nat_protocol *proto; int ret = 0; /* If we are supposed to map IPs, then we must be in the @@ -210,12 +214,13 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple, maxip = ntohl(range->max_ip); j = jhash_2words((__force u32)tuple->src.u3.ip, (__force u32)tuple->dst.u3.ip, 0); - *var_ipp = htonl(minip + j % (maxip - minip + 1)); + j = ((u64)j * (maxip - minip + 1)) >> 32; + *var_ipp = htonl(minip + j); } -/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING, - * we change the source to map into the range. For NF_IP_PRE_ROUTING - * and NF_IP_LOCAL_OUT, we change the destination to map into the +/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, + * we change the source to map into the range. For NF_INET_PRE_ROUTING + * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __ip_conntrack_confirm and drop the packet. */ @@ -226,7 +231,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { - struct nf_nat_protocol *proto; + const struct nf_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -276,12 +281,11 @@ out: unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, - unsigned int hooknum) + enum nf_nat_manip_type maniptype) { struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); - enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); @@ -293,10 +297,8 @@ nf_nat_setup_info(struct nf_conn *ct, } } - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || - hooknum == NF_IP_POST_ROUTING || - hooknum == NF_IP_LOCAL_IN || - hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(maniptype == IP_NAT_MANIP_SRC || + maniptype == IP_NAT_MANIP_DST); BUG_ON(nf_nat_initialized(ct, maniptype)); /* What we've got will look like inverse of reply. Normally @@ -355,7 +357,7 @@ manip_pkt(u_int16_t proto, enum nf_nat_manip_type maniptype) { struct iphdr *iph; - struct nf_nat_protocol *p; + const struct nf_nat_protocol *p; if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) return 0; @@ -372,10 +374,10 @@ manip_pkt(u_int16_t proto, iph = (void *)skb->data + iphdroff; if (maniptype == IP_NAT_MANIP_SRC) { - nf_csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); + csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { - nf_csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); + csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return 1; @@ -515,7 +517,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); /* Protocol registration. */ -int nf_nat_protocol_register(struct nf_nat_protocol *proto) +int nf_nat_protocol_register(const struct nf_nat_protocol *proto) { int ret = 0; @@ -532,7 +534,7 @@ int nf_nat_protocol_register(struct nf_nat_protocol *proto) EXPORT_SYMBOL(nf_nat_protocol_register); /* Noone stores the protocol anywhere; simply delete it. */ -void nf_nat_protocol_unregister(struct nf_nat_protocol *proto) +void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) { write_lock_bh(&nf_nat_lock); rcu_assign_pointer(nf_nat_protos[proto->protonum], @@ -547,10 +549,8 @@ int nf_nat_port_range_to_nlattr(struct sk_buff *skb, const struct nf_nat_range *range) { - NLA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), - &range->min.tcp.port); - NLA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), - &range->max.tcp.port); + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MIN, range->min.tcp.port); + NLA_PUT_BE16(skb, CTA_PROTONAT_PORT_MAX, range->max.tcp.port); return 0; @@ -568,8 +568,7 @@ nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) if (tb[CTA_PROTONAT_PORT_MIN]) { ret = 1; - range->min.tcp.port = - *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MIN]); + range->min.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); } if (!tb[CTA_PROTONAT_PORT_MAX]) { @@ -577,8 +576,7 @@ nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) range->max.tcp.port = range->min.tcp.port; } else { ret = 1; - range->max.tcp.port = - *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MAX]); + range->max.tcp.port = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); } return ret; diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 93e18ef114f2..a121989fdad7 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -76,7 +76,7 @@ static int set_addr(struct sk_buff *skb, static int set_h225_addr(struct sk_buff *skb, unsigned char **data, int dataoff, TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) { return set_addr(skb, data, dataoff, taddr->ipAddress.ip, addr->ip, port); @@ -86,7 +86,7 @@ static int set_h225_addr(struct sk_buff *skb, static int set_h245_addr(struct sk_buff *skb, unsigned char **data, int dataoff, H245_TransportAddress *taddr, - union nf_conntrack_address *addr, __be16 port) + union nf_inet_addr *addr, __be16 port) { return set_addr(skb, data, dataoff, taddr->unicastAddress.iPAddress.network, @@ -103,7 +103,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; for (i = 0; i < count; i++) { if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) { @@ -155,7 +155,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, int dir = CTINFO2DIR(ctinfo); int i; __be16 port; - union nf_conntrack_address addr; + union nf_inet_addr addr; for (i = 0; i < count; i++) { if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) && @@ -389,18 +389,14 @@ static void ip_nat_q931_expect(struct nf_conn *new, /* Change src to where master sends to */ range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; - - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = this->saved_proto; range.min_ip = range.max_ip = new->master->tuplehash[!this->dir].tuple.src.u3.ip; - - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); } /****************************************************************************/ @@ -412,7 +408,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; int dir = CTINFO2DIR(ctinfo); u_int16_t nated_port = ntohs(port); - union nf_conntrack_address addr; + union nf_inet_addr addr; /* Set expectations for NAT */ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; @@ -479,17 +475,13 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new, /* Change src to where master sends to */ range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip; - - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = this->saved_proto; range.min_ip = range.max_ip = this->saved_ip; - - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST); } /****************************************************************************/ diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 8718da00ef2a..4c0232842e75 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -20,6 +20,7 @@ #include <linux/netfilter_ipv4.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_protocol.h> @@ -180,8 +181,8 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, datalen, 0)); } } else - nf_proto_csum_replace2(&tcph->check, skb, - htons(oldlen), htons(datalen), 1); + inet_proto_csum_replace2(&tcph->check, skb, + htons(oldlen), htons(datalen), 1); if (rep_len != match_len) { set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); @@ -191,6 +192,8 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, /* Tell TCP window tracking about seq change */ nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, CTINFO2DIR(ctinfo)); + + nf_conntrack_event_cache(IPCT_NATSEQADJ, skb); } return 1; } @@ -270,8 +273,8 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, udph->check = CSUM_MANGLED_0; } } else - nf_proto_csum_replace2(&udph->check, skb, - htons(oldlen), htons(datalen), 1); + inet_proto_csum_replace2(&udph->check, skb, + htons(oldlen), htons(datalen), 1); return 1; } @@ -310,10 +313,10 @@ sack_adjust(struct sk_buff *skb, ntohl(sack->start_seq), new_start_seq, ntohl(sack->end_seq), new_end_seq); - nf_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); - nf_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->start_seq, new_start_seq, 0); + inet_proto_csum_replace4(&tcph->check, skb, + sack->end_seq, new_end_seq, 0); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); @@ -397,8 +400,8 @@ nf_nat_seq_adjust(struct sk_buff *skb, else newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before); - nf_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - nf_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), @@ -430,15 +433,13 @@ void nf_nat_follow_master(struct nf_conn *ct, range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = exp->saved_proto; range.min_ip = range.max_ip = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip; - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); } EXPORT_SYMBOL(nf_nat_follow_master); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 6817e7995f35..e63b944a2ebb 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -93,8 +93,7 @@ static void pptp_nat_expected(struct nf_conn *ct, range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; range.min = range.max = exp->saved_proto; } - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = IP_NAT_RANGE_MAP_IPS; @@ -104,8 +103,7 @@ static void pptp_nat_expected(struct nf_conn *ct, range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; range.min = range.max = exp->saved_proto; } - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); } /* outbound packets == from PNS to PAC */ diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index b820f9960356..9fa272e73113 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -135,9 +135,10 @@ gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, return 1; } -static struct nf_nat_protocol gre __read_mostly = { +static const struct nf_nat_protocol gre = { .name = "GRE", .protonum = IPPROTO_GRE, + .me = THIS_MODULE, .manip_pkt = gre_manip_pkt, .in_range = gre_in_range, .unique_tuple = gre_unique_tuple, diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index b9fc724388fc..a0e44c953cb6 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -65,13 +65,13 @@ icmp_manip_pkt(struct sk_buff *skb, return 0; hdr = (struct icmphdr *)(skb->data + hdroff); - nf_proto_csum_replace2(&hdr->checksum, skb, - hdr->un.echo.id, tuple->src.u.icmp.id, 0); + inet_proto_csum_replace2(&hdr->checksum, skb, + hdr->un.echo.id, tuple->src.u.icmp.id, 0); hdr->un.echo.id = tuple->src.u.icmp.id; return 1; } -struct nf_nat_protocol nf_nat_protocol_icmp = { +const struct nf_nat_protocol nf_nat_protocol_icmp = { .name = "ICMP", .protonum = IPPROTO_ICMP, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index 6bab2e184455..da23e9fbe679 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -132,12 +132,12 @@ tcp_manip_pkt(struct sk_buff *skb, if (hdrsize < sizeof(*hdr)) return 1; - nf_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); return 1; } -struct nf_nat_protocol nf_nat_protocol_tcp = { +const struct nf_nat_protocol nf_nat_protocol_tcp = { .name = "TCP", .protonum = IPPROTO_TCP, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index cbf1a61e2908..10df4db078af 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -117,9 +117,9 @@ udp_manip_pkt(struct sk_buff *skb, portptr = &hdr->dest; } if (hdr->check || skb->ip_summed == CHECKSUM_PARTIAL) { - nf_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); - nf_proto_csum_replace2(&hdr->check, skb, *portptr, newport, - 0); + inet_proto_csum_replace4(&hdr->check, skb, oldip, newip, 1); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, + 0); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } @@ -127,7 +127,7 @@ udp_manip_pkt(struct sk_buff *skb, return 1; } -struct nf_nat_protocol nf_nat_protocol_udp = { +const struct nf_nat_protocol nf_nat_protocol_udp = { .name = "UDP", .protonum = IPPROTO_UDP, .me = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c index cfd2742e9706..a26efeb073cb 100644 --- a/net/ipv4/netfilter/nf_nat_proto_unknown.c +++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c @@ -45,7 +45,7 @@ unknown_manip_pkt(struct sk_buff *skb, return 1; } -struct nf_nat_protocol nf_nat_unknown_protocol = { +const struct nf_nat_protocol nf_nat_unknown_protocol = { .name = "unknown", /* .me isn't set: getting a ref to this cannot fail. */ .manip_pkt = unknown_manip_pkt, diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 46b25ab5f78b..519182269e76 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -24,7 +24,9 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_rule.h> -#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) +#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_POST_ROUTING) | \ + (1 << NF_INET_LOCAL_OUT)) static struct { @@ -38,14 +40,14 @@ static struct .num_entries = 4, .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), .hook_entry = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, .underflow = { - [NF_IP_PRE_ROUTING] = 0, - [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 + [NF_INET_PRE_ROUTING] = 0, + [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, }, .entries = { @@ -76,7 +78,7 @@ static unsigned int ipt_snat_target(struct sk_buff *skb, enum ip_conntrack_info ctinfo; const struct nf_nat_multi_range_compat *mr = targinfo; - NF_CT_ASSERT(hooknum == NF_IP_POST_ROUTING); + NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); @@ -85,7 +87,7 @@ static unsigned int ipt_snat_target(struct sk_buff *skb, ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); NF_CT_ASSERT(out); - return nf_nat_setup_info(ct, &mr->range[0], hooknum); + return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); } /* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ @@ -95,7 +97,7 @@ static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; struct rtable *rt; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(&init_net, &rt, &fl) != 0) return; if (rt->rt_src != srcip && !warned) { @@ -118,20 +120,20 @@ static unsigned int ipt_dnat_target(struct sk_buff *skb, enum ip_conntrack_info ctinfo; const struct nf_nat_multi_range_compat *mr = targinfo; - NF_CT_ASSERT(hooknum == NF_IP_PRE_ROUTING || - hooknum == NF_IP_LOCAL_OUT); + NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); /* Connection must be valid and new. */ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); - if (hooknum == NF_IP_LOCAL_OUT && + if (hooknum == NF_INET_LOCAL_OUT && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) warn_if_extra_mangle(ip_hdr(skb)->daddr, mr->range[0].min_ip); - return nf_nat_setup_info(ct, &mr->range[0], hooknum); + return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); } static bool ipt_snat_checkentry(const char *tablename, @@ -182,7 +184,7 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) pr_debug("Allocating NULL binding for %p (%u.%u.%u.%u)\n", ct, NIPQUAD(ip)); - return nf_nat_setup_info(ct, &range, hooknum); + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); } unsigned int @@ -201,7 +203,7 @@ alloc_null_binding_confirmed(struct nf_conn *ct, unsigned int hooknum) pr_debug("Allocating NULL binding for confirmed %p (%u.%u.%u.%u)\n", ct, NIPQUAD(ip)); - return nf_nat_setup_info(ct, &range, hooknum); + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); } int nf_nat_rule_find(struct sk_buff *skb, @@ -227,7 +229,7 @@ static struct xt_target ipt_snat_reg __read_mostly = { .target = ipt_snat_target, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = 1 << NF_IP_POST_ROUTING, + .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = ipt_snat_checkentry, .family = AF_INET, }; @@ -237,7 +239,7 @@ static struct xt_target ipt_dnat_reg __read_mostly = { .target = ipt_dnat_target, .targetsize = sizeof(struct nf_nat_multi_range_compat), .table = "nat", - .hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .checkentry = ipt_dnat_checkentry, .family = AF_INET, }; diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 8996ccb757db..606a170bf4ca 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -228,15 +228,13 @@ static void ip_nat_sdp_expect(struct nf_conn *ct, range.flags = IP_NAT_RANGE_MAP_IPS; range.min_ip = range.max_ip = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip; - /* hook doesn't matter, but it has to do source manip */ - nf_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC); /* For DST manip, map port here to where it's expected. */ range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); range.min = range.max = exp->saved_proto; range.min_ip = range.max_ip = exp->saved_ip; - /* hook doesn't matter, but it has to do destination manip */ - nf_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST); } /* So, this packet has hit the connection tracking matching code. diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 03709d6b4b06..07f2a49926d4 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -60,7 +60,7 @@ MODULE_ALIAS("ip_nat_snmp_basic"); #define SNMP_PORT 161 #define SNMP_TRAP_PORT 162 -#define NOCT1(n) (*(u8 *)n) +#define NOCT1(n) (*(u8 *)(n)) static int debug; static DEFINE_SPINLOCK(snmp_lock); diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 7db76ea9af91..99b2c788d5a8 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -137,7 +137,7 @@ nf_nat_fn(unsigned int hooknum, if (unlikely(nf_ct_is_confirmed(ct))) /* NAT module was loaded late */ ret = alloc_null_binding_confirmed(ct, hooknum); - else if (hooknum == NF_IP_LOCAL_IN) + else if (hooknum == NF_INET_LOCAL_IN) /* LOCAL_IN hook doesn't have a chain! */ ret = alloc_null_binding(ct, hooknum); else @@ -273,13 +273,13 @@ nf_nat_adjust(unsigned int hooknum, /* We must be after connection tracking and before packet filtering. */ -static struct nf_hook_ops nf_nat_ops[] = { +static struct nf_hook_ops nf_nat_ops[] __read_mostly = { /* Before packet filtering, change destination */ { .hook = nf_nat_in, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_PRE_ROUTING, + .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ @@ -287,7 +287,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_out, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* After conntrack, adjust sequence number */ @@ -295,7 +295,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_adjust, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_POST_ROUTING, + .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SEQ_ADJUST, }, /* Before packet filtering, change destination */ @@ -303,7 +303,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_local_fn, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_OUT, + .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ @@ -311,7 +311,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_fn, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, /* After conntrack, adjust sequence number */ @@ -319,7 +319,7 @@ static struct nf_hook_ops nf_nat_ops[] = { .hook = nf_nat_adjust, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_IP_LOCAL_IN, + .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SEQ_ADJUST, }, }; @@ -332,7 +332,7 @@ static int __init nf_nat_standalone_init(void) #ifdef CONFIG_XFRM BUG_ON(ip_nat_decode_session != NULL); - ip_nat_decode_session = nat_decode_session; + rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); #endif ret = nf_nat_rule_init(); if (ret < 0) { @@ -350,7 +350,7 @@ static int __init nf_nat_standalone_init(void) nf_nat_rule_cleanup(); cleanup_decode_session: #ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; + rcu_assign_pointer(ip_nat_decode_session, NULL); synchronize_net(); #endif return ret; @@ -361,7 +361,7 @@ static void __exit nf_nat_standalone_fini(void) nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); nf_nat_rule_cleanup(); #ifdef CONFIG_XFRM - ip_nat_decode_session = NULL; + rcu_assign_pointer(ip_nat_decode_session, NULL); synchronize_net(); #endif /* Conntrack caches are unregistered in nf_conntrack_cleanup */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ce34b281803f..d63474c6b400 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -53,14 +53,16 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", - sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), + sock_prot_inuse_get(&tcp_prot), + atomic_read(&tcp_orphan_count), tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot)); - seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot)); - seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot)); + seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse_get(&udp_prot), + atomic_read(&udp_memory_allocated)); + seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(&udplite_prot)); + seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(&raw_prot)); seq_printf(seq, "FRAG: inuse %d memory %d\n", - ip_frag_nqueues(), ip_frag_mem()); + ip_frag_nqueues(&init_net), ip_frag_mem(&init_net)); return 0; } @@ -309,7 +311,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", - IPV4_DEVCONF_ALL(FORWARDING) ? 1 : 2, sysctl_ip_default_ttl); + IPV4_DEVCONF_ALL(&init_net, FORWARDING) ? 1 : 2, + sysctl_ip_default_ttl); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e7050f8eabeb..85c08696abbe 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -80,38 +80,51 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> -struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE]; -DEFINE_RWLOCK(raw_v4_lock); +static struct raw_hashinfo raw_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(), +}; -static void raw_v4_hash(struct sock *sk) +void raw_hash_sk(struct sock *sk, struct raw_hashinfo *h) { - struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num & - (RAWV4_HTABLE_SIZE - 1)]; + struct hlist_head *head; + + head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; - write_lock_bh(&raw_v4_lock); + write_lock_bh(&h->lock); sk_add_node(sk, head); - sock_prot_inc_use(sk->sk_prot); - write_unlock_bh(&raw_v4_lock); + sock_prot_inuse_add(sk->sk_prot, 1); + write_unlock_bh(&h->lock); } +EXPORT_SYMBOL_GPL(raw_hash_sk); -static void raw_v4_unhash(struct sock *sk) +void raw_unhash_sk(struct sock *sk, struct raw_hashinfo *h) { - write_lock_bh(&raw_v4_lock); + write_lock_bh(&h->lock); if (sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - write_unlock_bh(&raw_v4_lock); + sock_prot_inuse_add(sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(raw_unhash_sk); + +static void raw_v4_hash(struct sock *sk) +{ + raw_hash_sk(sk, &raw_v4_hashinfo); } -struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - __be32 raddr, __be32 laddr, - int dif) +static void raw_v4_unhash(struct sock *sk) +{ + raw_unhash_sk(sk, &raw_v4_hashinfo); +} + +static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, + unsigned short num, __be32 raddr, __be32 laddr, int dif) { struct hlist_node *node; sk_for_each_from(sk, node) { struct inet_sock *inet = inet_sk(sk); - if (inet->num == num && + if (sk->sk_net == net && inet->num == num && !(inet->daddr && inet->daddr != raddr) && !(inet->rcv_saddr && inet->rcv_saddr != laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) @@ -150,17 +163,20 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; int delivered = 0; + struct net *net; - read_lock(&raw_v4_lock); - head = &raw_v4_htable[hash]; + read_lock(&raw_v4_hashinfo.lock); + head = &raw_v4_hashinfo.ht[hash]; if (hlist_empty(head)) goto out; - sk = __raw_v4_lookup(__sk_head(head), iph->protocol, + + net = skb->dev->nd_net; + sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); @@ -173,16 +189,34 @@ int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) if (clone) raw_rcv(sk, clone); } - sk = __raw_v4_lookup(sk_next(sk), iph->protocol, + sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); } out: - read_unlock(&raw_v4_lock); + read_unlock(&raw_v4_hashinfo.lock); return delivered; } -void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) +int raw_local_deliver(struct sk_buff *skb, int protocol) +{ + int hash; + struct sock *raw_sk; + + hash = protocol & (RAW_HTABLE_SIZE - 1); + raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); + + /* If there maybe a raw socket we must check - if not we + * don't care less + */ + if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash)) + raw_sk = NULL; + + return raw_sk != NULL; + +} + +static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) { struct inet_sock *inet = inet_sk(sk); const int type = icmp_hdr(skb)->type; @@ -236,12 +270,38 @@ void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) } } +void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) +{ + int hash; + struct sock *raw_sk; + struct iphdr *iph; + struct net *net; + + hash = protocol & (RAW_HTABLE_SIZE - 1); + + read_lock(&raw_v4_hashinfo.lock); + raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); + if (raw_sk != NULL) { + iph = (struct iphdr *)skb->data; + net = skb->dev->nd_net; + + while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, + iph->daddr, iph->saddr, + skb->dev->ifindex)) != NULL) { + raw_err(raw_sk, skb, info); + raw_sk = sk_next(raw_sk); + iph = (struct iphdr *)skb->data; + } + } + read_unlock(&raw_v4_hashinfo.lock); +} + static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ if (sock_queue_rcv_skb(sk, skb) < 0) { - /* FIXME: increment a raw drops counter here */ + atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } @@ -252,6 +312,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } @@ -320,7 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, icmp_out_count(((struct icmphdr *) skb_transport_header(skb))->type); - err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); if (err > 0) err = inet->recverr ? net_xmit_errno(err) : 0; @@ -474,7 +535,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (msg->msg_flags & MSG_DONTROUTE) tos |= RTO_ONLINK; - if (MULTICAST(daddr)) { + if (ipv4_is_multicast(daddr)) { if (!ipc.oif) ipc.oif = inet->mc_index; if (!saddr) @@ -497,7 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&rt, &fl, sk, 1); + err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); } if (err) goto done; @@ -564,7 +625,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(sk->sk_net, addr->sin_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -789,22 +850,18 @@ struct proto raw_prot = { }; #ifdef CONFIG_PROC_FS -struct raw_iter_state { - int bucket; -}; - -#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private) - static struct sock *raw_get_first(struct seq_file *seq) { struct sock *sk; struct raw_iter_state* state = raw_seq_private(seq); - for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) { + for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; + ++state->bucket) { struct hlist_node *node; - sk_for_each(sk, node, &raw_v4_htable[state->bucket]) - if (sk->sk_family == PF_INET) + sk_for_each(sk, node, &state->h->ht[state->bucket]) + if (sk->sk_net == state->p.net && + sk->sk_family == state->family) goto found; } sk = NULL; @@ -820,10 +877,11 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) sk = sk_next(sk); try_again: ; - } while (sk && sk->sk_family != PF_INET); + } while (sk && sk->sk_net != state->p.net && + sk->sk_family != state->family); - if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { - sk = sk_head(&raw_v4_htable[state->bucket]); + if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { + sk = sk_head(&state->h->ht[state->bucket]); goto try_again; } return sk; @@ -839,13 +897,16 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) return pos ? NULL : sk; } -static void *raw_seq_start(struct seq_file *seq, loff_t *pos) +void *raw_seq_start(struct seq_file *seq, loff_t *pos) { - read_lock(&raw_v4_lock); + struct raw_iter_state *state = raw_seq_private(seq); + + read_lock(&state->h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } +EXPORT_SYMBOL_GPL(raw_seq_start); -static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) +void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; @@ -856,11 +917,15 @@ static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } +EXPORT_SYMBOL_GPL(raw_seq_next); -static void raw_seq_stop(struct seq_file *seq, void *v) +void raw_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&raw_v4_lock); + struct raw_iter_state *state = raw_seq_private(seq); + + read_unlock(&state->h->lock); } +EXPORT_SYMBOL_GPL(raw_seq_stop); static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i) { @@ -871,28 +936,30 @@ static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i) srcp = inet->num; sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d", i, src, srcp, dest, destp, sp->sk_state, atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp); + atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); return tmpbuf; } +#define TMPSZ 128 + static int raw_seq_show(struct seq_file *seq, void *v) { - char tmpbuf[129]; + char tmpbuf[TMPSZ+1]; if (v == SEQ_START_TOKEN) - seq_printf(seq, "%-127s\n", + seq_printf(seq, "%-*s\n", TMPSZ-1, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode"); + "inode drops"); else { struct raw_iter_state *state = raw_seq_private(seq); - seq_printf(seq, "%-127s\n", + seq_printf(seq, "%-*s\n", TMPSZ-1, get_raw_sock(v, tmpbuf, state->bucket)); } return 0; @@ -905,29 +972,62 @@ static const struct seq_operations raw_seq_ops = { .show = raw_seq_show, }; -static int raw_seq_open(struct inode *inode, struct file *file) +int raw_seq_open(struct inode *ino, struct file *file, struct raw_hashinfo *h, + unsigned short family) { - return seq_open_private(file, &raw_seq_ops, + int err; + struct raw_iter_state *i; + + err = seq_open_net(ino, file, &raw_seq_ops, sizeof(struct raw_iter_state)); + if (err < 0) + return err; + + i = raw_seq_private((struct seq_file *)file->private_data); + i->h = h; + i->family = family; + return 0; +} +EXPORT_SYMBOL_GPL(raw_seq_open); + +static int raw_v4_seq_open(struct inode *inode, struct file *file) +{ + return raw_seq_open(inode, file, &raw_v4_hashinfo, PF_INET); } static const struct file_operations raw_seq_fops = { .owner = THIS_MODULE, - .open = raw_seq_open, + .open = raw_v4_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; -int __init raw_proc_init(void) +static __net_init int raw_init_net(struct net *net) { - if (!proc_net_fops_create(&init_net, "raw", S_IRUGO, &raw_seq_fops)) + if (!proc_net_fops_create(net, "raw", S_IRUGO, &raw_seq_fops)) return -ENOMEM; + return 0; } +static __net_exit void raw_exit_net(struct net *net) +{ + proc_net_remove(net, "raw"); +} + +static __net_initdata struct pernet_operations raw_net_ops = { + .init = raw_init_net, + .exit = raw_exit_net, +}; + +int __init raw_proc_init(void) +{ + return register_pernet_subsys(&raw_net_ops); +} + void __init raw_proc_exit(void) { - proc_net_remove(&init_net, "raw"); + unregister_pernet_subsys(&raw_net_ops); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 28484f396b04..896c768e41a2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -92,6 +92,7 @@ #include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> +#include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> @@ -132,13 +133,14 @@ static int ip_rt_mtu_expires = 10 * 60 * HZ; static int ip_rt_min_pmtu = 512 + 20 + 20; static int ip_rt_min_advmss = 256; static int ip_rt_secret_interval = 10 * 60 * HZ; +static int ip_rt_flush_expected; static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) static struct timer_list rt_flush_timer; -static void rt_check_expire(struct work_struct *work); -static DECLARE_DELAYED_WORK(expires_work, rt_check_expire); +static void rt_worker_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); static struct timer_list rt_secret_timer; /* @@ -152,7 +154,7 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); -static int rt_garbage_collect(void); +static int rt_garbage_collect(struct dst_ops *ops); static struct dst_ops ipv4_dst_ops = { @@ -165,6 +167,7 @@ static struct dst_ops ipv4_dst_ops = { .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, + .local_out = ip_local_out, .entry_size = sizeof(struct rtable), }; @@ -232,16 +235,25 @@ struct rt_hash_bucket { static spinlock_t *rt_hash_locks; # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] -# define rt_hash_lock_init() { \ - int i; \ - rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ - if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ - for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ - spin_lock_init(&rt_hash_locks[i]); \ - } + +static __init void rt_hash_lock_init(void) +{ + int i; + + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, + GFP_KERNEL); + if (!rt_hash_locks) + panic("IP: failed to allocate rt_hash_locks\n"); + + for (i = 0; i < RT_HASH_LOCK_SZ; i++) + spin_lock_init(&rt_hash_locks[i]); +} #else # define rt_hash_lock_addr(slot) NULL -# define rt_hash_lock_init() + +static inline void rt_hash_lock_init(void) +{ +} #endif static struct rt_hash_bucket *rt_hash_table; @@ -478,6 +490,83 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; +#ifdef CONFIG_NET_CLS_ROUTE +static int ip_rt_acct_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + unsigned int i; + + if ((offset & 3) || (length & 3)) + return -EIO; + + if (offset >= sizeof(struct ip_rt_acct) * 256) { + *eof = 1; + return 0; + } + + if (offset + length >= sizeof(struct ip_rt_acct) * 256) { + length = sizeof(struct ip_rt_acct) * 256 - offset; + *eof = 1; + } + + offset /= sizeof(u32); + + if (length > 0) { + u32 *dst = (u32 *) buffer; + + *start = buffer; + memset(dst, 0, length); + + for_each_possible_cpu(i) { + unsigned int j; + u32 *src; + + src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; + for (j = 0; j < length/4; j++) + dst[j] += src[j]; + } + } + return length; +} +#endif + +static __init int ip_rt_proc_init(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, + &rt_cache_seq_fops); + if (!pde) + goto err1; + + pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat); + if (!pde) + goto err2; + + pde->proc_fops = &rt_cpu_seq_fops; + +#ifdef CONFIG_NET_CLS_ROUTE + pde = create_proc_read_entry("rt_acct", 0, net->proc_net, + ip_rt_acct_read, NULL); + if (!pde) + goto err3; +#endif + return 0; + +#ifdef CONFIG_NET_CLS_ROUTE +err3: + remove_proc_entry("rt_cache", net->proc_net_stat); +#endif +err2: + remove_proc_entry("rt_cache", net->proc_net); +err1: + return -ENOMEM; +} +#else +static inline int ip_rt_proc_init(struct net *net) +{ + return 0; +} #endif /* CONFIG_PROC_FS */ static __inline__ void rt_free(struct rtable *rt) @@ -559,7 +648,41 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) (fl1->iif ^ fl2->iif)) == 0; } -static void rt_check_expire(struct work_struct *work) +static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) +{ + return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net; +} + +/* + * Perform a full scan of hash table and free all entries. + * Can be called by a softirq or a process. + * In the later case, we want to be reschedule if necessary + */ +static void rt_do_flush(int process_context) +{ + unsigned int i; + struct rtable *rth, *next; + + for (i = 0; i <= rt_hash_mask; i++) { + if (process_context && need_resched()) + cond_resched(); + rth = rt_hash_table[i].chain; + if (!rth) + continue; + + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + rt_hash_table[i].chain = NULL; + spin_unlock_bh(rt_hash_lock_addr(i)); + + for (; rth; rth = next) { + next = rth->u.dst.rt_next; + rt_free(rth); + } + } +} + +static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; @@ -605,33 +728,33 @@ static void rt_check_expire(struct work_struct *work) spin_unlock_bh(rt_hash_lock_addr(i)); } rover = i; +} + +/* + * rt_worker_func() is run in process context. + * If a whole flush was scheduled, it is done. + * Else, we call rt_check_expire() to scan part of the hash table + */ +static void rt_worker_func(struct work_struct *work) +{ + if (ip_rt_flush_expected) { + ip_rt_flush_expected = 0; + rt_do_flush(1); + } else + rt_check_expire(); schedule_delayed_work(&expires_work, ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter * in the case of a forced flush event. */ -static void rt_run_flush(unsigned long dummy) +static void rt_run_flush(unsigned long process_context) { - int i; - struct rtable *rth, *next; - rt_deadline = 0; get_random_bytes(&rt_hash_rnd, 4); - for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(rt_hash_lock_addr(i)); - rth = rt_hash_table[i].chain; - if (rth) - rt_hash_table[i].chain = NULL; - spin_unlock_bh(rt_hash_lock_addr(i)); - - for (; rth; rth = next) { - next = rth->u.dst.rt_next; - rt_free(rth); - } - } + rt_do_flush(process_context); } static DEFINE_SPINLOCK(rt_flush_lock); @@ -665,7 +788,7 @@ void rt_cache_flush(int delay) if (delay <= 0) { spin_unlock_bh(&rt_flush_lock); - rt_run_flush(0); + rt_run_flush(user_mode); return; } @@ -676,12 +799,17 @@ void rt_cache_flush(int delay) spin_unlock_bh(&rt_flush_lock); } +/* + * We change rt_hash_rnd and ask next rt_worker_func() invocation + * to perform a flush in process context + */ static void rt_secret_rebuild(unsigned long dummy) { - unsigned long now = jiffies; - - rt_cache_flush(0); - mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); + get_random_bytes(&rt_hash_rnd, 4); + ip_rt_flush_expected = 1; + cancel_delayed_work(&expires_work); + schedule_delayed_work(&expires_work, HZ/10); + mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval); } /* @@ -697,7 +825,7 @@ static void rt_secret_rebuild(unsigned long dummy) and when load increases it reduces to limit cache size. */ -static int rt_garbage_collect(void) +static int rt_garbage_collect(struct dst_ops *ops) { static unsigned long expire = RT_GC_TIMEOUT; static unsigned long last_gc; @@ -728,14 +856,14 @@ static int rt_garbage_collect(void) equilibrium = ipv4_dst_ops.gc_thresh; goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; if (goal > 0) { - equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); + equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; } } else { /* We are in dangerous area. Try to reduce cache really * aggressively. */ - goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); + goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; } @@ -838,7 +966,7 @@ restart: spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { - if (compare_keys(&rth->fl, &rt->fl)) { + if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { /* Put it first */ *rthp = rth->u.dst.rt_next; /* @@ -912,7 +1040,7 @@ restart: int saved_int = ip_rt_gc_min_interval; ip_rt_gc_elasticity = 1; ip_rt_gc_min_interval = 0; - rt_garbage_collect(); + rt_garbage_collect(&ipv4_dst_ops); ip_rt_gc_min_interval = saved_int; ip_rt_gc_elasticity = saved_elasticity; goto restart; @@ -1031,7 +1159,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, return; if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) + || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) + || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { @@ -1040,7 +1169,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { - if (inet_addr_type(new_gw) != RTN_UNICAST) + if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST) goto reject_redirect; } @@ -1291,7 +1420,8 @@ static __inline__ unsigned short guess_mtu(unsigned short old_mtu) return 68; } -unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) +unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, + unsigned short new_mtu) { int i; unsigned short old_mtu = ntohs(iph->tot_len); @@ -1314,7 +1444,8 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->fl.iif == 0 && - !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { + !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) && + rth->u.dst.dev->nd_net == net) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -1389,8 +1520,9 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != init_net.loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); + if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = + in_dev_get(dev->nd_net->loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1434,7 +1566,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(&rt->fl, &res) == 0) { + else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) { src = FIB_RES_PREFSRC(res); fib_res_put(&res); } else @@ -1509,12 +1641,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (in_dev == NULL) return -EINVAL; - if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || - skb->protocol != htons(ETH_P_IP)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ZERONET(saddr)) { - if (!LOCAL_MCAST(daddr)) + if (ipv4_is_zeronet(saddr)) { + if (!ipv4_is_local_multicast(daddr)) goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else if (fib_validate_source(saddr, 0, tos, 0, @@ -1556,7 +1688,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, } #ifdef CONFIG_IP_MROUTE - if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->u.dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); @@ -1643,7 +1775,7 @@ static inline int __mkroute_input(struct sk_buff *skb, if (err) flags |= RTCF_DIRECTSRC; - if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && + if (out_dev == in_dev && err && !(flags & RTCF_MASQ) && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; @@ -1652,7 +1784,7 @@ static inline int __mkroute_input(struct sk_buff *skb, /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. */ - if (out_dev == in_dev && !(flags & RTCF_DNAT)) { + if (out_dev == in_dev) { err = -EINVAL; goto cleanup; } @@ -1756,6 +1888,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, __be32 spec_dst; int err = -EINVAL; int free_res = 0; + struct net * net = dev->nd_net; /* IP on this device is disabled. */ @@ -1766,7 +1899,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || + ipv4_is_loopback(saddr)) goto martian_source; if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) @@ -1775,16 +1909,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ - if (ZERONET(saddr)) + if (ipv4_is_zeronet(saddr)) goto martian_source; - if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) + if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || + ipv4_is_loopback(daddr)) goto martian_destination; /* * Now we are ready to route packet. */ - if ((err = fib_lookup(&fl, &res)) != 0) { + if ((err = fib_lookup(net, &fl, &res)) != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; @@ -1799,7 +1934,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - init_net.loopback_dev->ifindex, + net->loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1825,7 +1960,7 @@ brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - if (ZERONET(saddr)) + if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, @@ -1861,7 +1996,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = init_net.loopback_dev; + rth->u.dst.dev = net->loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -1921,7 +2056,9 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable * rth; unsigned hash; int iif = dev->ifindex; + struct net *net; + net = skb->dev->nd_net; tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif); @@ -1933,7 +2070,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.iif == iif && rth->fl.oif == 0 && rth->fl.mark == skb->mark && - rth->fl.fl4_tos == tos) { + rth->fl.fl4_tos == tos && + rth->u.dst.dev->nd_net == net) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -1955,7 +2093,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, Note, that multicast routers are not affected, because route cache entry is created eventually. */ - if (MULTICAST(daddr)) { + if (ipv4_is_multicast(daddr)) { struct in_device *in_dev; rcu_read_lock(); @@ -1964,7 +2102,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + || (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif ) { rcu_read_unlock(); @@ -1990,14 +2129,14 @@ static inline int __mkroute_output(struct rtable **result, u32 tos = RT_FL_TOS(oldflp); int err = 0; - if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) + if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) return -EINVAL; if (fl->fl4_dst == htonl(0xFFFFFFFF)) res->type = RTN_BROADCAST; - else if (MULTICAST(fl->fl4_dst)) + else if (ipv4_is_multicast(fl->fl4_dst)) res->type = RTN_MULTICAST; - else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) + else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) return -EINVAL; if (dev_out->flags & IFF_LOOPBACK) @@ -2077,7 +2216,7 @@ static inline int __mkroute_output(struct rtable **result, #ifdef CONFIG_IP_MROUTE if (res->type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !LOCAL_MCAST(oldflp->fl4_dst)) { + !ipv4_is_local_multicast(oldflp->fl4_dst)) { rth->u.dst.input = ip_mr_input; rth->u.dst.output = ip_mc_output; } @@ -2119,7 +2258,8 @@ static inline int ip_mkroute_output(struct rtable **rp, * Major route resolver routine. */ -static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) +static int ip_route_output_slow(struct net *net, struct rtable **rp, + const struct flowi *oldflp) { u32 tos = RT_FL_TOS(oldflp); struct flowi fl = { .nl_u = { .ip4_u = @@ -2131,7 +2271,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = init_net.loopback_dev->ifindex, + .iif = net->loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2147,26 +2287,27 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) if (oldflp->fl4_src) { err = -EINVAL; - if (MULTICAST(oldflp->fl4_src) || - BADCLASS(oldflp->fl4_src) || - ZERONET(oldflp->fl4_src)) + if (ipv4_is_multicast(oldflp->fl4_src) || + ipv4_is_lbcast(oldflp->fl4_src) || + ipv4_is_zeronet(oldflp->fl4_src)) goto out; /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldflp->fl4_src); + dev_out = ip_dev_find(net, oldflp->fl4_src); if (dev_out == NULL) goto out; /* I removed check for oif == dev_out->oif here. It was wrong for two reasons: - 1. ip_dev_find(saddr) can return wrong iface, if saddr is - assigned to multiple interfaces. + 1. ip_dev_find(net, saddr) can return wrong iface, if saddr + is assigned to multiple interfaces. 2. Moreover, we are allowed to send packets with saddr of another iface. --ANK */ if (oldflp->oif == 0 - && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + && (ipv4_is_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface without fiddling with IP_MULTICAST_IF or IP_PKTINFO. @@ -2192,7 +2333,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) if (oldflp->oif) { - dev_out = dev_get_by_index(&init_net, oldflp->oif); + dev_out = dev_get_by_index(net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; @@ -2203,14 +2344,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) goto out; /* Wrong error code */ } - if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) { + if (ipv4_is_local_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF)) { if (!fl.fl4_src) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl.fl4_src) { - if (MULTICAST(oldflp->fl4_dst)) + if (ipv4_is_multicast(oldflp->fl4_dst)) fl.fl4_src = inet_select_addr(dev_out, 0, fl.fl4_scope); else if (!oldflp->fl4_dst) @@ -2225,15 +2367,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = init_net.loopback_dev; + dev_out = net->loopback_dev; dev_hold(dev_out); - fl.oif = init_net.loopback_dev->ifindex; + fl.oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(&fl, &res)) { + if (fib_lookup(net, &fl, &res)) { res.fi = NULL; if (oldflp->oif) { /* Apparently, routing tables are wrong. Assume, @@ -2272,7 +2414,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = init_net.loopback_dev; + dev_out = net->loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) @@ -2288,7 +2430,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) else #endif if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(&fl, &res); + fib_select_default(net, &fl, &res); if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); @@ -2311,7 +2453,8 @@ make_route: out: return err; } -int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) +int __ip_route_output_key(struct net *net, struct rtable **rp, + const struct flowi *flp) { unsigned hash; struct rtable *rth; @@ -2327,7 +2470,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) rth->fl.oif == flp->oif && rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & - (IPTOS_RT_MASK | RTO_ONLINK))) { + (IPTOS_RT_MASK | RTO_ONLINK)) && + rth->u.dst.dev->nd_net == net) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2338,7 +2482,7 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) } rcu_read_unlock_bh(); - return ip_route_output_slow(rp, flp); + return ip_route_output_slow(net, rp, flp); } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2357,12 +2501,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = { }; -static int ipv4_blackhole_output(struct sk_buff *skb) -{ - kfree_skb(skb); - return 0; -} - static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk) { struct rtable *ort = *rp; @@ -2374,8 +2512,8 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock atomic_set(&new->__refcnt, 1); new->__use = 1; - new->input = ipv4_blackhole_output; - new->output = ipv4_blackhole_output; + new->input = dst_discard; + new->output = dst_discard; memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); new->dev = ort->u.dst.dev; @@ -2406,11 +2544,12 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock return (rt ? 0 : -ENOMEM); } -int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, + struct sock *sk, int flags) { int err; - if ((err = __ip_route_output_key(rp, flp)) != 0) + if ((err = __ip_route_output_key(net, rp, flp)) != 0) return err; if (flp->proto) { @@ -2418,7 +2557,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, flp->fl4_src = (*rp)->rt_src; if (!flp->fl4_dst) flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); + err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, + flags ? XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) err = ipv4_dst_blackhole(rp, flp, sk); @@ -2430,9 +2570,9 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct rtable **rp, struct flowi *flp) +int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) { - return ip_route_output_flow(rp, flp, NULL, 0); + return ip_route_output_flow(net, rp, flp, NULL, 0); } static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, @@ -2499,8 +2639,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, #ifdef CONFIG_IP_MROUTE __be32 dst = rt->rt_dst; - if (MULTICAST(dst) && !LOCAL_MCAST(dst) && - IPV4_DEVCONF_ALL(MC_FORWARDING)) { + if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && + IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) { int err = ipmr_get_route(skb, r, nowait); if (err <= 0) { if (!nowait) { @@ -2531,6 +2671,7 @@ nla_put_failure: static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { + struct net *net = in_skb->sk->sk_net; struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; struct rtable *rt = NULL; @@ -2540,6 +2681,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void int err; struct sk_buff *skb; + if (net != &init_net) + return -EINVAL; + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); if (err < 0) goto errout; @@ -2595,7 +2739,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void }, .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, }; - err = ip_route_output_key(&rt, &fl); + err = ip_route_output_key(&init_net, &rt, &fl); } if (err) @@ -2610,7 +2754,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err <= 0) goto errout_free; - err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid); + err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid); errout: return err; @@ -2862,51 +3006,7 @@ ctl_table ipv4_route_table[] = { #endif #ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct *ip_rt_acct; - -/* This code sucks. But you should have seen it before! --RR */ - -/* IP route accounting ptr for this logical cpu number. */ -#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256) - -#ifdef CONFIG_PROC_FS -static int ip_rt_acct_read(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) -{ - unsigned int i; - - if ((offset & 3) || (length & 3)) - return -EIO; - - if (offset >= sizeof(struct ip_rt_acct) * 256) { - *eof = 1; - return 0; - } - - if (offset + length >= sizeof(struct ip_rt_acct) * 256) { - length = sizeof(struct ip_rt_acct) * 256 - offset; - *eof = 1; - } - - offset /= sizeof(u32); - - if (length > 0) { - u32 *dst = (u32 *) buffer; - - *start = buffer; - memset(dst, 0, length); - - for_each_possible_cpu(i) { - unsigned int j; - u32 *src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset; - - for (j = 0; j < length/4; j++) - dst[j] += src[j]; - } - } - return length; -} -#endif /* CONFIG_PROC_FS */ +struct ip_rt_acct *ip_rt_acct __read_mostly; #endif /* CONFIG_NET_CLS_ROUTE */ static __initdata unsigned long rhash_entries; @@ -2927,16 +3027,9 @@ int __init ip_rt_init(void) (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE - { - int order; - for (order = 0; - (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) - /* NOTHING */; - ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); - memset(ip_rt_acct, 0, PAGE_SIZE << order); - } #endif ipv4_dst_ops.kmem_cachep = @@ -2964,10 +3057,8 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - init_timer(&rt_flush_timer); - rt_flush_timer.function = rt_run_flush; - init_timer(&rt_secret_timer); - rt_secret_timer.function = rt_secret_rebuild; + setup_timer(&rt_flush_timer, rt_run_flush, 0); + setup_timer(&rt_secret_timer, rt_secret_rebuild, 0); /* All the timers, started at system startup tend to synchronize. Perturb it a bit. @@ -2979,20 +3070,8 @@ int __init ip_rt_init(void) ip_rt_secret_interval; add_timer(&rt_secret_timer); -#ifdef CONFIG_PROC_FS - { - struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ - if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) || - !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, - init_net.proc_net_stat))) { - return -ENOMEM; - } - rtstat_pde->proc_fops = &rt_cpu_seq_fops; - } -#ifdef CONFIG_NET_CLS_ROUTE - create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL); -#endif -#endif + if (ip_rt_proc_init(&init_net)) + printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 2da1be0589a9..f470fe4511db 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -264,7 +264,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, { .sport = th->dest, .dport = th->source } } }; security_req_classify_flow(req, &fl); - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(&init_net, &rt, &fl)) { reqsk_free(req); goto out; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index bec6fe880657..82cdf23837e3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -13,83 +13,20 @@ #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/seqlock.h> +#include <linux/init.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp.h> +#include <net/udp.h> #include <net/cipso_ipv4.h> #include <net/inet_frag.h> -/* From af_inet.c */ -extern int sysctl_ip_nonlocal_bind; - -#ifdef CONFIG_SYSCTL static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; -#endif - -struct ipv4_config ipv4_config; - -#ifdef CONFIG_SYSCTL - -static -int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int val = IPV4_DEVCONF_ALL(FORWARDING); - int ret; - - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - - if (write && IPV4_DEVCONF_ALL(FORWARDING) != val) - inet_forward_change(); - - return ret; -} - -static int ipv4_sysctl_forward_strategy(ctl_table *table, - int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int *valp = table->data; - int new; - - if (!newval || !newlen) - return 0; - - if (newlen != sizeof(int)) - return -EINVAL; - - if (get_user(new, (int __user *)newval)) - return -EFAULT; - - if (new == *valp) - return 0; - - if (oldval && oldlenp) { - size_t len; - - if (get_user(len, oldlenp)) - return -EFAULT; - - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, valp, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - *valp = new; - inet_forward_change(); - return 1; -} extern seqlock_t sysctl_port_range_lock; extern int sysctl_local_port_range[2]; @@ -256,7 +193,7 @@ static int strategy_allowed_congestion_control(ctl_table *table, int __user *nam } -ctl_table ipv4_table[] = { +static struct ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, .procname = "tcp_timestamps", @@ -290,15 +227,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_FORWARD, - .procname = "ip_forward", - .data = &IPV4_DEVCONF_ALL(FORWARDING), - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &ipv4_sysctl_forward, - .strategy = &ipv4_sysctl_forward_strategy - }, - { .ctl_name = NET_IPV4_DEFAULT_TTL, .procname = "ip_default_ttl", .data = &sysctl_ip_default_ttl, @@ -356,22 +284,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, - .procname = "ipfrag_high_thresh", - .data = &ip4_frags_ctl.high_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, - .procname = "ipfrag_low_thresh", - .data = &ip4_frags_ctl.low_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_IPV4_DYNADDR, .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, @@ -380,15 +292,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_TIME, - .procname = "ipfrag_time", - .data = &ip4_frags_ctl.timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies - }, - { .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME, .procname = "tcp_keepalive_time", .data = &sysctl_tcp_keepalive_time, @@ -731,23 +634,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, - .procname = "ipfrag_secret_interval", - .data = &ip4_frags_ctl.secret_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies - }, - { - .procname = "ipfrag_max_dist", - .data = &sysctl_ipfrag_max_dist, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &zero - }, - { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, @@ -885,9 +771,52 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "udp_mem", + .data = &sysctl_udp_mem, + .maxlen = sizeof(sysctl_udp_mem), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "udp_rmem_min", + .data = &sysctl_udp_rmem_min, + .maxlen = sizeof(sysctl_udp_rmem_min), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "udp_wmem_min", + .data = &sysctl_udp_wmem_min, + .maxlen = sizeof(sysctl_udp_wmem_min), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, { .ctl_name = 0 } }; -#endif /* CONFIG_SYSCTL */ +struct ctl_path net_ipv4_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { }, +}; +EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); + +static __init int sysctl_ipv4_init(void) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); + return hdr == NULL ? -ENOMEM : 0; +} -EXPORT_SYMBOL(ipv4_config); +__initcall(sysctl_ipv4_init); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e65182f7af1..a0d373bd9065 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -254,6 +254,10 @@ #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/skbuff.h> +#include <linux/splice.h> +#include <linux/net.h> +#include <linux/socket.h> #include <linux/random.h> #include <linux/bootmem.h> #include <linux/cache.h> @@ -265,6 +269,7 @@ #include <net/xfrm.h> #include <net/ip.h> #include <net/netdma.h> +#include <net/sock.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -292,9 +297,18 @@ EXPORT_SYMBOL(tcp_memory_allocated); EXPORT_SYMBOL(tcp_sockets_allocated); /* + * TCP splice context + */ +struct tcp_splice_state { + struct pipe_inode_info *pipe; + size_t len; + unsigned int flags; +}; + +/* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. - * All the sk_stream_mem_schedule() is of this nature: accounting + * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ int tcp_memory_pressure __read_mostly; @@ -471,7 +485,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk_charge_skb(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; } @@ -482,7 +497,6 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, if (flags & MSG_OOB) { tp->urg_mode = 1; tp->snd_up = tp->write_seq; - TCP_SKB_CB(skb)->sacked |= TCPCB_URG; } } @@ -501,6 +515,145 @@ static inline void tcp_push(struct sock *sk, int flags, int mss_now, } } +static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct tcp_splice_state *tss = rd_desc->arg.data; + + return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); +} + +static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) +{ + /* Store TCP splice context information in read_descriptor_t. */ + read_descriptor_t rd_desc = { + .arg.data = tss, + }; + + return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); +} + +/** + * tcp_splice_read - splice data from TCP socket to a pipe + * @sock: socket to splice from + * @ppos: position (not valid) + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given socket and fill them into a pipe. + * + **/ +ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct tcp_splice_state tss = { + .pipe = pipe, + .len = len, + .flags = flags, + }; + long timeo; + ssize_t spliced; + int ret; + + /* + * We can't seek on a socket input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + ret = spliced = 0; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); + while (tss.len) { + ret = __tcp_splice_read(sk, &tss); + if (ret < 0) + break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + ret = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_state == TCP_CLOSE) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + if (!sock_flag(sk, SOCK_DONE)) + ret = -ENOTCONN; + break; + } + if (!timeo) { + ret = -EAGAIN; + break; + } + sk_wait_data(sk, &timeo); + if (signal_pending(current)) { + ret = sock_intr_errno(timeo); + break; + } + continue; + } + tss.len -= ret; + spliced += ret; + + release_sock(sk); + lock_sock(sk); + + if (sk->sk_err || sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || + signal_pending(current)) + break; + } + + release_sock(sk); + + if (spliced) + return spliced; + + return ret; +} + +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +{ + struct sk_buff *skb; + + /* The TCP header must be at least 32-bit aligned. */ + size = ALIGN(size, 4); + + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); + if (skb) { + if (sk_wmem_schedule(sk, skb->truesize)) { + /* + * Make sure that we have exactly size bytes + * available to the caller, no more, no less. + */ + skb_reserve(skb, skb_tailroom(skb) - size); + return skb; + } + __kfree_skb(skb); + } else { + sk->sk_prot->enter_memory_pressure(); + sk_stream_moderate_sndbuf(sk); + } + return NULL; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -537,8 +690,7 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, 0, 0, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -555,7 +707,7 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; if (can_coalesce) { @@ -569,7 +721,7 @@ new_segment: skb->data_len += copy; skb->truesize += copy; sk->sk_wmem_queued += copy; - sk->sk_forward_alloc -= copy; + sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; @@ -718,8 +870,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, select_size(sk), - 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, select_size(sk), + sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -776,7 +928,7 @@ new_segment: if (copy > PAGE_SIZE - off) copy = PAGE_SIZE - off; - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; if (!page) { @@ -867,7 +1019,7 @@ do_fault: * reset, where we can be unlinking the send_head. */ tcp_check_send_head(sk, skb); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); } do_error: @@ -1500,6 +1652,41 @@ recv_urg: goto out; } +void tcp_set_state(struct sock *sk, int state) +{ + int oldstate = sk->sk_state; + + switch (state) { + case TCP_ESTABLISHED: + if (oldstate != TCP_ESTABLISHED) + TCP_INC_STATS(TCP_MIB_CURRESTAB); + break; + + case TCP_CLOSE: + if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) + TCP_INC_STATS(TCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(&tcp_hashinfo, sk); + /* fall through */ + default: + if (oldstate==TCP_ESTABLISHED) + TCP_DEC_STATS(TCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; + +#ifdef STATE_TRACE + SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); +#endif +} +EXPORT_SYMBOL_GPL(tcp_set_state); + /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some @@ -1586,7 +1773,7 @@ void tcp_close(struct sock *sk, long timeout) __kfree_skb(skb); } - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of @@ -1689,7 +1876,7 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (tcp_too_many_orphans(sk, atomic_read(sk->sk_prot->orphan_count))) { if (net_ratelimit()) @@ -2411,7 +2598,6 @@ void tcp_done(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_done); -extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; @@ -2430,9 +2616,7 @@ void __init tcp_init(void) unsigned long limit; int order, i, max_share; - if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) - __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), - sizeof(skb->cb)); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", @@ -2509,11 +2693,11 @@ void __init tcp_init(void) limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; sysctl_tcp_wmem[1] = 16*1024; sysctl_tcp_wmem[2] = max(64*1024, max_share); - sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; sysctl_tcp_rmem[1] = 87380; sysctl_tcp_rmem[2] = max(87380, max_share); @@ -2532,6 +2716,7 @@ EXPORT_SYMBOL(tcp_poll); EXPORT_SYMBOL(tcp_read_sock); EXPORT_SYMBOL(tcp_recvmsg); EXPORT_SYMBOL(tcp_sendmsg); +EXPORT_SYMBOL(tcp_splice_read); EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 5dba0fc8f579..5212ed9b0c98 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -136,8 +136,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int data_acked) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 55fca1820c34..3a6be23d222f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -274,6 +274,27 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } +/* RFC2861 Check whether we are limited by application or congestion window + * This is the inverse of cwnd check in tcp_tso_should_defer + */ +int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 left; + + if (in_flight >= tp->snd_cwnd) + return 1; + + if (!sk_can_gso(sk)) + return 0; + + left = tp->snd_cwnd - in_flight; + if (sysctl_tcp_tso_win_divisor) + return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd; + else + return left <= tcp_max_burst(tp); +} +EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); /* * Slow start is used when congestion window is less than slow start @@ -324,7 +345,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 80bd084a9f91..3aa0b23c1ea0 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -246,8 +246,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int data_acked) +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 14a073d8b60f..8b6caaf75bb9 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -109,8 +109,7 @@ static void hstcp_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct sock *sk, u32 adk, - u32 in_flight, int data_acked) +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 5215691f2760..af99776146ff 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -225,8 +225,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int data_acked) +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index b3e55cf56171..44618b675916 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -85,8 +85,7 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct hybla *ca = inet_csk_ca(sk); @@ -103,7 +102,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, return; if (!ca->hybla_en) - return tcp_reno_cong_avoid(sk, ack, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight); if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 5aa5f5496d6d..1eba160b72dc 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -256,8 +256,7 @@ static void tcp_illinois_state(struct sock *sk, u8 new_state) /* * Increase window in response to successful acknowledgment. */ -static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b39f0d86e44c..fa2c85ca5bc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -105,6 +105,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -120,8 +121,7 @@ int sysctl_tcp_abc __read_mostly; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -132,7 +132,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ - len = skb_shinfo(skb)->gso_size ?: skb->len; + len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { icsk->icsk_ack.rcv_mss = len; } else { @@ -172,8 +172,8 @@ static void tcp_incr_quickack(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); - if (quickacks==0) - quickacks=2; + if (quickacks == 0) + quickacks = 2; if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } @@ -198,7 +198,7 @@ static inline int tcp_in_quickack_mode(const struct sock *sk) static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) { - if (tp->ecn_flags&TCP_ECN_OK) + if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -215,7 +215,7 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) { - if (tp->ecn_flags&TCP_ECN_OK) { + if (tp->ecn_flags & TCP_ECN_OK) { if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) tp->ecn_flags |= TCP_ECN_DEMAND_CWR; /* Funny extension: if ECT is not set on a segment, @@ -228,19 +228,19 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) { - if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) + if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) { - if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK)) + if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return 1; return 0; } @@ -289,8 +289,8 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; + int truesize = tcp_win_from_space(skb->truesize) >> 1; + int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -302,8 +302,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -317,12 +316,13 @@ static void tcp_grow_window(struct sock *sk, * will fit to rcvbuf in future. */ if (tcp_win_from_space(skb->truesize) <= skb->len) - incr = 2*tp->advmss; + incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb); if (incr) { - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, + tp->window_clamp); inet_csk(sk)->icsk_ack.quick |= 1; } } @@ -397,10 +397,9 @@ static void tcp_clamp_window(struct sock *sk) sysctl_tcp_rmem[2]); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) - tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); + tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } - /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. @@ -413,7 +412,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); - hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, tp->rcv_wnd / 2); hint = min(hint, TCP_MIN_RCVMSS); hint = max(hint, TCP_MIN_MSS); @@ -470,16 +469,15 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, - jiffies - tp->rcv_rtt_est.time, - 1); + tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; tp->rcv_rtt_est.time = tcp_time_stamp; } -static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, + const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && @@ -502,8 +500,7 @@ void tcp_rcv_space_adjust(struct sock *sk) goto new_measure; time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || - tp->rcv_rtt_est.rtt == 0) + if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; space = 2 * (tp->copied_seq - tp->rcvq_space.seq); @@ -579,7 +576,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } else { int m = now - icsk->icsk_ack.lrcvtime; - if (m <= TCP_ATO_MIN/2) { + if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { @@ -591,7 +588,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); } } icsk->icsk_ack.lrcvtime = now; @@ -608,7 +605,7 @@ static u32 tcp_rto_min(struct sock *sk) u32 rto_min = TCP_RTO_MIN; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) - rto_min = dst->metrics[RTAX_RTO_MIN-1]; + rto_min = dst->metrics[RTAX_RTO_MIN - 1]; return rto_min; } @@ -671,14 +668,14 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; + tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. */ - tp->srtt = m<<3; /* take the measured time to be rtt */ - tp->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev = m << 1; /* make sure rto = 3*rtt */ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } @@ -732,7 +729,7 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); - if (dst && (dst->flags&DST_HOST)) { + if (dst && (dst->flags & DST_HOST)) { const struct inet_connection_sock *icsk = inet_csk(sk); int m; @@ -742,7 +739,7 @@ void tcp_update_metrics(struct sock *sk) * Reset our results. */ if (!(dst_metric_locked(dst, RTAX_RTT))) - dst->metrics[RTAX_RTT-1] = 0; + dst->metrics[RTAX_RTT - 1] = 0; return; } @@ -754,9 +751,9 @@ void tcp_update_metrics(struct sock *sk) */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) - dst->metrics[RTAX_RTT-1] = tp->srtt; + dst->metrics[RTAX_RTT - 1] = tp->srtt; else - dst->metrics[RTAX_RTT-1] -= (m>>3); + dst->metrics[RTAX_RTT - 1] -= (m >> 3); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { @@ -769,7 +766,7 @@ void tcp_update_metrics(struct sock *sk) m = tp->mdev; if (m >= dst_metric(dst, RTAX_RTTVAR)) - dst->metrics[RTAX_RTTVAR-1] = m; + dst->metrics[RTAX_RTTVAR - 1] = m; else dst->metrics[RTAX_RTTVAR-1] -= (dst->metrics[RTAX_RTTVAR-1] - m)>>2; @@ -783,7 +780,7 @@ void tcp_update_metrics(struct sock *sk) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; if (!dst_metric_locked(dst, RTAX_CWND) && tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) - dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; + dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ @@ -863,6 +860,9 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) */ static void tcp_disable_fack(struct tcp_sock *tp) { + /* RFC3517 uses different metric in lost marker => reset on change */ + if (tcp_is_fack(tp)) + tp->lost_skb_hint = NULL; tp->rx_opt.sack_ok &= ~2; } @@ -1112,16 +1112,22 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, * * Search retransmitted skbs from write_queue that were sent when snd_nxt was * less than what is now known to be received by the other end (derived from - * SACK blocks by the caller). Also calculate the lowest snd_nxt among the - * remaining retransmitted skbs to avoid some costly processing per ACKs. + * highest SACK block). Also calculate the lowest snd_nxt among the remaining + * retransmitted skbs to avoid some costly processing per ACKs. */ -static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) +static void tcp_mark_lost_retrans(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int flag = 0; int cnt = 0; u32 new_low_seq = tp->snd_nxt; + u32 received_upto = tcp_highest_sack_seq(tp); + + if (!tcp_is_fack(tp) || !tp->retrans_out || + !after(received_upto, tp->lost_retrans_low) || + icsk->icsk_ca_state != TCP_CA_Recovery) + return; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; @@ -1149,9 +1155,8 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - flag |= FLAG_DATA_SACKED; - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; @@ -1161,8 +1166,6 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) if (tp->retrans_out) tp->lost_retrans_low = new_low_seq; - - return flag; } static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, @@ -1230,34 +1233,205 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } +static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, + int *reord, int dup_sack, int fack_count) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 sacked = TCP_SKB_CB(skb)->sacked; + int flag = 0; + + /* Account D-SACK for retransmitted packet. */ + if (dup_sack && (sacked & TCPCB_RETRANS)) { + if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + if (sacked & TCPCB_SACKED_ACKED) + *reord = min(fack_count, *reord); + } + + /* Nothing to do; acked frame is about to be dropped (was ACKed). */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + return flag; + + if (!(sacked & TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= + ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= tcp_skb_pcount(skb); + tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } else { + if (!(sacked & TCPCB_RETRANS)) { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (before(TCP_SKB_CB(skb)->seq, + tcp_highest_sack_seq(tp))) + *reord = min(fack_count, *reord); + + /* SACK enhanced F-RTO (RFC4138; Appendix B) */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + flag |= FLAG_ONLY_ORIG_SACKED; + } + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out += tcp_skb_pcount(skb); + + fack_count += tcp_skb_pcount(skb); + + /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ + if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->lost_skb_hint)->seq)) + tp->lost_cnt_hint += tcp_skb_pcount(skb); + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + tcp_advance_highest_sack(sk, skb); + } + + /* D-SACK. We can detect redundant retransmission in S|R and plain R + * frames and clear it. undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; + } + + return flag; +} + +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, + struct tcp_sack_block *next_dup, + u32 start_seq, u32 end_seq, + int dup_sack_in, int *fack_count, + int *reord, int *flag) +{ + tcp_for_write_queue_from(skb, sk) { + int in_sack = 0; + int dup_sack = dup_sack_in; + + if (skb == tcp_send_head(sk)) + break; + + /* queue is in-order => we can short-circuit the walk early */ + if (!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + if ((next_dup != NULL) && + before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { + in_sack = tcp_match_skb_to_sack(sk, skb, + next_dup->start_seq, + next_dup->end_seq); + if (in_sack > 0) + dup_sack = 1; + } + + if (in_sack <= 0) + in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, + end_seq); + if (unlikely(in_sack < 0)) + break; + + if (in_sack) + *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack, + *fack_count); + + *fack_count += tcp_skb_pcount(skb); + } + return skb; +} + +/* Avoid all extra work that is being done by sacktag while walking in + * a normal way + */ +static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, + u32 skip_to_seq) +{ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; + + if (!before(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) + break; + } + return skb; +} + +static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, + struct sock *sk, + struct tcp_sack_block *next_dup, + u32 skip_to_seq, + int *fack_count, int *reord, + int *flag) +{ + if (next_dup == NULL) + return skb; + + if (before(next_dup->start_seq, skip_to_seq)) { + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); + tcp_sacktag_walk(skb, sk, NULL, + next_dup->start_seq, next_dup->end_seq, + 1, fack_count, reord, flag); + } + + return skb; +} + +static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache) +{ + return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); +} + static int -tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) +tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, + u32 prior_snd_una) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); - struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); - struct sk_buff *cached_skb; - int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; + struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); + struct tcp_sack_block sp[4]; + struct tcp_sack_block *cache; + struct sk_buff *skb; + int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE) >> 3; + int used_sacks; int reord = tp->packets_out; - int prior_fackets; - u32 highest_sack_end_seq = tp->lost_retrans_low; int flag = 0; int found_dup_sack = 0; - int cached_fack_count; - int i; + int fack_count; + int i, j; int first_sack_index; - int force_one_sack; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) tp->fackets_out = 0; - tp->highest_sack = tp->snd_una; + tcp_highest_sack_reset(sk); } - prior_fackets = tp->fackets_out; - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; @@ -1272,78 +1446,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (!tp->packets_out) goto out; - /* SACK fastpath: - * if the only SACK change is the increase of the end_seq of - * the first block then only apply that SACK block - * and use retrans queue hinting otherwise slowpath */ - force_one_sack = 1; - for (i = 0; i < num_sacks; i++) { - __be32 start_seq = sp[i].start_seq; - __be32 end_seq = sp[i].end_seq; - - if (i == 0) { - if (tp->recv_sack_cache[i].start_seq != start_seq) - force_one_sack = 0; - } else { - if ((tp->recv_sack_cache[i].start_seq != start_seq) || - (tp->recv_sack_cache[i].end_seq != end_seq)) - force_one_sack = 0; - } - tp->recv_sack_cache[i].start_seq = start_seq; - tp->recv_sack_cache[i].end_seq = end_seq; - } - /* Clear the rest of the cache sack blocks so they won't match mistakenly. */ - for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) { - tp->recv_sack_cache[i].start_seq = 0; - tp->recv_sack_cache[i].end_seq = 0; - } - + used_sacks = 0; first_sack_index = 0; - if (force_one_sack) - num_sacks = 1; - else { - int j; - tp->fastpath_skb_hint = NULL; - - /* order SACK blocks to allow in order walk of the retrans queue */ - for (i = num_sacks-1; i > 0; i--) { - for (j = 0; j < i; j++){ - if (after(ntohl(sp[j].start_seq), - ntohl(sp[j+1].start_seq))){ - struct tcp_sack_block_wire tmp; - - tmp = sp[j]; - sp[j] = sp[j+1]; - sp[j+1] = tmp; - - /* Track where the first SACK block goes to */ - if (j == first_sack_index) - first_sack_index = j+1; - } - - } - } - } - - /* Use SACK fastpath hint if valid */ - cached_skb = tp->fastpath_skb_hint; - cached_fack_count = tp->fastpath_cnt_hint; - if (!cached_skb) { - cached_skb = tcp_write_queue_head(sk); - cached_fack_count = 0; - } - for (i = 0; i < num_sacks; i++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count; - int dup_sack = (found_dup_sack && (i == first_sack_index)); - int next_dup = (found_dup_sack && (i+1 == first_sack_index)); + int dup_sack = !i && found_dup_sack; - sp++; + sp[used_sacks].start_seq = ntohl(get_unaligned(&sp_wire[i].start_seq)); + sp[used_sacks].end_seq = ntohl(get_unaligned(&sp_wire[i].end_seq)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (!tcp_is_sackblock_valid(tp, dup_sack, + sp[used_sacks].start_seq, + sp[used_sacks].end_seq)) { if (dup_sack) { if (!tp->undo_marker) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); @@ -1352,169 +1465,148 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && - !after(end_seq, tp->snd_una)) + !after(sp[used_sacks].end_seq, tp->snd_una)) continue; NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); } + if (i == 0) + first_sack_index = -1; continue; } - skb = cached_skb; - fack_count = cached_fack_count; - - /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) - flag |= FLAG_DATA_LOST; - - tcp_for_write_queue_from(skb, sk) { - int in_sack = 0; - u8 sacked; - - if (skb == tcp_send_head(sk)) - break; - - cached_skb = skb; - cached_fack_count = fack_count; - if (i == first_sack_index) { - tp->fastpath_skb_hint = skb; - tp->fastpath_cnt_hint = fack_count; - } + /* Ignore very old stuff early */ + if (!after(sp[used_sacks].end_seq, prior_snd_una)) + continue; - /* The retransmission queue is always in order, so - * we can short-circuit the walk early. - */ - if (!before(TCP_SKB_CB(skb)->seq, end_seq)) - break; + used_sacks++; + } - dup_sack = (found_dup_sack && (i == first_sack_index)); + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = used_sacks - 1; i > 0; i--) { + for (j = 0; j < i; j++) { + if (after(sp[j].start_seq, sp[j + 1].start_seq)) { + struct tcp_sack_block tmp; - /* Due to sorting DSACK may reside within this SACK block! */ - if (next_dup) { - u32 dup_start = ntohl(sp->start_seq); - u32 dup_end = ntohl(sp->end_seq); + tmp = sp[j]; + sp[j] = sp[j + 1]; + sp[j + 1] = tmp; - if (before(TCP_SKB_CB(skb)->seq, dup_end)) { - in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end); - if (in_sack > 0) - dup_sack = 1; - } + /* Track where the first SACK block goes to */ + if (j == first_sack_index) + first_sack_index = j + 1; } + } + } - /* DSACK info lost if out-of-mem, try SACK still */ - if (in_sack <= 0) - in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); - if (unlikely(in_sack < 0)) - break; + skb = tcp_write_queue_head(sk); + fack_count = 0; + i = 0; - sacked = TCP_SKB_CB(skb)->sacked; + if (!tp->sacked_out) { + /* It's already past, so skip checking against it */ + cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); + } else { + cache = tp->recv_sack_cache; + /* Skip empty blocks in at head of the cache */ + while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && + !cache->end_seq) + cache++; + } - /* Account D-SACK for retransmitted packet. */ - if ((dup_sack && in_sack) && - (sacked & TCPCB_RETRANS) && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) - tp->undo_retrans--; + while (i < used_sacks) { + u32 start_seq = sp[i].start_seq; + u32 end_seq = sp[i].end_seq; + int dup_sack = (found_dup_sack && (i == first_sack_index)); + struct tcp_sack_block *next_dup = NULL; - /* The frame is ACKed. */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { - if (sacked&TCPCB_RETRANS) { - if ((dup_sack && in_sack) && - (sacked&TCPCB_SACKED_ACKED)) - reord = min(fack_count, reord); - } + if (found_dup_sack && ((i + 1) == first_sack_index)) + next_dup = &sp[i + 1]; - /* Nothing to do; acked frame is about to be dropped. */ - fack_count += tcp_skb_pcount(skb); - continue; - } + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + flag |= FLAG_DATA_LOST; - if (!in_sack) { - fack_count += tcp_skb_pcount(skb); - continue; + /* Skip too early cached blocks */ + while (tcp_sack_cache_ok(tp, cache) && + !before(start_seq, cache->end_seq)) + cache++; + + /* Can skip some work by looking recv_sack_cache? */ + if (tcp_sack_cache_ok(tp, cache) && !dup_sack && + after(end_seq, cache->start_seq)) { + + /* Head todo? */ + if (before(start_seq, cache->start_seq)) { + skb = tcp_sacktag_skip(skb, sk, start_seq); + skb = tcp_sacktag_walk(skb, sk, next_dup, + start_seq, + cache->start_seq, + dup_sack, &fack_count, + &reord, &flag); } - if (!(sacked&TCPCB_SACKED_ACKED)) { - if (sacked & TCPCB_SACKED_RETRANS) { - /* If the segment is not tagged as lost, - * we do not clear RETRANS, believing - * that retransmission is still in flight. - */ - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); - tp->lost_out -= tcp_skb_pcount(skb); - tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } else { - if (!(sacked & TCPCB_RETRANS)) { - /* New sack for not retransmitted frame, - * which was in hole. It is reordering. - */ - if (fack_count < prior_fackets) - reord = min(fack_count, reord); - - /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) - flag |= FLAG_ONLY_ORIG_SACKED; - } - - if (sacked & TCPCB_LOST) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - tp->lost_out -= tcp_skb_pcount(skb); + /* Rest of the block already fully processed? */ + if (!after(end_seq, cache->end_seq)) + goto advance_sp; - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - } - } + skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, + cache->end_seq, + &fack_count, &reord, + &flag); - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; - flag |= FLAG_DATA_SACKED; - tp->sacked_out += tcp_skb_pcount(skb); - - fack_count += tcp_skb_pcount(skb); - if (fack_count > tp->fackets_out) - tp->fackets_out = fack_count; - - if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) { - tp->highest_sack = TCP_SKB_CB(skb)->seq; - highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq; - } - } else { - if (dup_sack && (sacked&TCPCB_RETRANS)) - reord = min(fack_count, reord); - - fack_count += tcp_skb_pcount(skb); + /* ...tail remains todo... */ + if (tcp_highest_sack_seq(tp) == cache->end_seq) { + /* ...but better entrypoint exists! */ + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; + cache++; + goto walk; } - /* D-SACK. We can detect redundant retransmission - * in S|R and plain R frames and clear it. - * undo_retrans is decreased above, L|R frames - * are accounted above as well. - */ - if (dup_sack && - (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - tp->retransmit_skb_hint = NULL; - } + skb = tcp_sacktag_skip(skb, sk, cache->end_seq); + /* Check overlap against next cached too (past this one already) */ + cache++; + continue; + } + + if (!before(start_seq, tcp_highest_sack_seq(tp))) { + skb = tcp_highest_sack(sk); + if (skb == NULL) + break; + fack_count = tp->fackets_out; } + skb = tcp_sacktag_skip(skb, sk, start_seq); + +walk: + skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq, + dup_sack, &fack_count, &reord, &flag); +advance_sp: /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct * due to in-order walk */ if (after(end_seq, tp->frto_highmark)) flag &= ~FLAG_ONLY_ORIG_SACKED; + + i++; } - if (tp->retrans_out && - after(highest_sack_end_seq, tp->lost_retrans_low) && - icsk->icsk_ca_state == TCP_CA_Recovery) - flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); + /* Clear the head of the cache sack blocks so we can skip it next time */ + for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { + tp->recv_sack_cache[i].start_seq = 0; + tp->recv_sack_cache[i].end_seq = 0; + } + for (j = 0; j < used_sacks; j++) + tp->recv_sack_cache[i++] = sp[j]; + + tcp_mark_lost_retrans(sk); tcp_verify_left_out(tp); - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && + if ((reord < tp->fackets_out) && + ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, tp->fackets_out - reord, 0); @@ -1565,10 +1657,10 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 >= tp->sacked_out) + if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else - tp->sacked_out -= acked-1; + tp->sacked_out -= acked - 1; } tcp_check_reno_reordering(sk, acked); tcp_verify_left_out(tp); @@ -1602,10 +1694,10 @@ int tcp_use_frto(struct sock *sk) tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) return 0; /* Short-circuit when first non-SACKed skb has been checked */ - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) break; } return 1; @@ -1715,7 +1807,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... */ - if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { + if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { /* For some reason this R-bit might get cleared? */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out += tcp_skb_pcount(skb); @@ -1728,7 +1820,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) } /* Don't lost mark skbs that were fwd transmitted after RTO */ - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) && + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) && !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1743,7 +1835,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); @@ -1810,7 +1902,7 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { @@ -1822,7 +1914,7 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_verify_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); @@ -1830,18 +1922,15 @@ void tcp_enter_loss(struct sock *sk, int how) tp->frto_counter = 0; } -static int tcp_check_sack_reneging(struct sock *sk) +/* If ACK arrived pointing to a remembered SACK, it means that our + * remembered SACKs do not reflect real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * + * Do processing similar to RTO timeout. + */ +static int tcp_check_sack_reneging(struct sock *sk, int flag) { - struct sk_buff *skb; - - /* If ACK arrived pointing to a remembered SACK, - * it means that our remembered SACKs do not reflect - * real state of receiver i.e. - * receiver _host_ is heavily congested (or buggy). - * Do processing similar to RTO timeout. - */ - if ((skb = tcp_write_queue_head(sk)) != NULL && - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); @@ -1857,7 +1946,27 @@ static int tcp_check_sack_reneging(struct sock *sk) static inline int tcp_fackets_out(struct tcp_sock *tp) { - return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; +} + +/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs + * counter when SACK is enabled (without SACK, sacked_out is used for + * that purpose). + * + * Instead, with FACK TCP uses fackets_out that includes both SACKed + * segments up to the highest received SACK block so far and holes in + * between them. + * + * With reordering, holes may still be in flight, so RFC3517 recovery + * uses pure sacked_out (total number of SACKed segments) even though + * it violates the RFC that uses duplicate ACKs, often these are equal + * but when e.g. out-of-window ACKs or packet duplication occurs, + * they differ. Since neither occurs due to loss, TCP should really + * ignore them. + */ +static inline int tcp_dupack_heurestics(struct tcp_sock *tp) +{ + return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) @@ -1980,13 +2089,13 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_fackets_out(tp) > tp->reordering) + if (tcp_dupack_heurestics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk)) + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -2010,17 +2119,18 @@ static int tcp_time_to_recover(struct sock *sk) * retransmitted past LOST markings in the first place? I'm not fully sure * about undo and end of connection cases, which can cause R without L? */ -static void tcp_verify_retransmit_hint(struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((tp->retransmit_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = NULL; } -/* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, int packets) +/* Mark head of queue up as lost. With RFC3517 SACK, the packets is + * is against sacked "cnt", otherwise it's against facked "cnt" + */ +static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -2042,8 +2152,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; - cnt += tcp_skb_pcount(skb); - if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + + if (tcp_is_fack(tp) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + cnt += tcp_skb_pcount(skb); + + if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) || + after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -2056,17 +2171,22 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk) +static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_fack(tp)) { + if (tcp_is_reno(tp)) { + tcp_mark_head_lost(sk, 1, fast_rexmit); + } else if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, lost); + tcp_mark_head_lost(sk, lost, fast_rexmit); } else { - tcp_mark_head_lost(sk, 1); + int sacked_upto = tp->sacked_out - tp->reordering; + if (sacked_upto < 0) + sacked_upto = 0; + tcp_mark_head_lost(sk, sacked_upto, fast_rexmit); } /* New heuristics: it is possible only after we switched @@ -2074,7 +2194,7 @@ static void tcp_update_scoreboard(struct sock *sk) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { + if (tcp_is_fack(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -2105,7 +2225,7 @@ static void tcp_update_scoreboard(struct sock *sk) static inline void tcp_moderate_cwnd(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tcp_packets_in_flight(tp) + tcp_max_burst(tp)); tp->snd_cwnd_stamp = tcp_time_stamp; } @@ -2125,15 +2245,15 @@ static void tcp_cwnd_down(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; - if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || - (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) { - tp->snd_cwnd_cnt = decr&1; + if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) || + (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) { + tp->snd_cwnd_cnt = decr & 1; decr >>= 1; if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) tp->snd_cwnd -= decr; - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_stamp = tcp_time_stamp; } } @@ -2177,7 +2297,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) if (icsk->icsk_ca_ops->undo_cwnd) tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); else - tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -2196,8 +2316,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) static inline int tcp_may_undo(struct tcp_sock *tp) { - return tp->undo_marker && - (!tp->undo_retrans || tcp_packet_delayed(tp)); + return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } /* People celebrate: "We love our President!" */ @@ -2247,7 +2366,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; + int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering); if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -2316,7 +2435,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (tp->retrans_out == 0) tp->retrans_stamp = 0; - if (flag&FLAG_ECE) + if (flag & FLAG_ECE) tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { @@ -2362,7 +2481,6 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } - /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2374,38 +2492,35 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ -static void -tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) +static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); - int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) && - (tp->fackets_out > tp->reordering)); + int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); + int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && + (tcp_fackets_out(tp) > tp->reordering)); + int fast_rexmit = 0; - /* Some technical things: - * 1. Reno does not count dupacks (sacked_out) automatically. */ - if (!tp->packets_out) + if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag&FLAG_ECE) + if (flag & FLAG_ECE) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tp->sacked_out && tcp_check_sack_reneging(sk)) + if (tcp_check_sack_reneging(sk, flag)) return; /* C. Process data loss notification, provided it is valid. */ - if ((flag&FLAG_DATA_LOST) && + if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); + tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } @@ -2465,7 +2580,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) do_lost = tcp_try_undo_partial(sk, pkts_acked); break; case TCP_CA_Loss: - if (flag&FLAG_DATA_ACKED) + if (flag & FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); @@ -2515,7 +2630,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tp->undo_retrans = tp->retrans_out; if (icsk->icsk_ca_state < TCP_CA_CWR) { - if (!(flag&FLAG_ECE)) + if (!(flag & FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); @@ -2524,10 +2639,11 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); + fast_rexmit = 1; } - if (do_lost || tcp_head_timedout(sk)) - tcp_update_scoreboard(sk); + if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) + tcp_update_scoreboard(sk, fast_rexmit); tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } @@ -2591,11 +2707,10 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight, good); + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -2609,7 +2724,8 @@ static void tcp_rearm_rto(struct sock *sk) if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } } @@ -2638,8 +2754,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * is before the ack sequence we can discard it as it's confirmed to have * arrived at the other end. */ -static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, - int prior_fackets) +static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -2647,8 +2762,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, u32 now = tcp_time_stamp; int fully_acked = 1; int flag = 0; - int prior_packets = tp->packets_out; - u32 cnt = 0; + u32 pkts_acked = 0; u32 reord = tp->packets_out; s32 seq_rtt = -1; s32 ca_seq_rtt = -1; @@ -2657,7 +2771,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u32 end_seq; - u32 packets_acked; + u32 acked_pcount; u8 sacked = scb->sacked; /* Determine how many packets and what bytes were acked, tso and else */ @@ -2666,14 +2780,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, !after(tp->snd_una, scb->seq)) break; - packets_acked = tcp_tso_acked(sk, skb); - if (!packets_acked) + acked_pcount = tcp_tso_acked(sk, skb); + if (!acked_pcount) break; fully_acked = 0; end_seq = tp->snd_una; } else { - packets_acked = tcp_skb_pcount(skb); + acked_pcount = tcp_skb_pcount(skb); end_seq = scb->end_seq; } @@ -2683,44 +2797,34 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, tcp_mtup_probe_success(sk, skb); } - if (sacked) { - if (sacked & TCPCB_RETRANS) { - if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= packets_acked; - flag |= FLAG_RETRANS_DATA_ACKED; - ca_seq_rtt = -1; - seq_rtt = -1; - if ((flag & FLAG_DATA_ACKED) || - (packets_acked > 1)) - flag |= FLAG_NONHEAD_RETRANS_ACKED; - } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } - if (!(sacked & TCPCB_SACKED_ACKED)) - reord = min(cnt, reord); - } - - if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= packets_acked; - if (sacked & TCPCB_LOST) - tp->lost_out -= packets_acked; - - if ((sacked & TCPCB_URG) && tp->urg_mode && - !before(end_seq, tp->snd_up)) - tp->urg_mode = 0; + if (sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= acked_pcount; + flag |= FLAG_RETRANS_DATA_ACKED; + ca_seq_rtt = -1; + seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; if (seq_rtt < 0) { seq_rtt = ca_seq_rtt; } - reord = min(cnt, reord); + if (!(sacked & TCPCB_SACKED_ACKED)) + reord = min(pkts_acked, reord); } - tp->packets_out -= packets_acked; - cnt += packets_acked; + + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= acked_pcount; + if (sacked & TCPCB_LOST) + tp->lost_out -= acked_pcount; + + if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up))) + tp->urg_mode = 0; + + tp->packets_out -= acked_pcount; + pkts_acked += acked_pcount; /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -2740,12 +2844,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, break; tcp_unlink_write_queue(skb, sk); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); tcp_clear_all_retrans_hints(tp); } + if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + flag |= FLAG_SACK_RENEGING; + if (flag & FLAG_ACKED) { - u32 pkts_acked = prior_packets - tp->packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -2761,9 +2867,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, } tp->fackets_out -= min(pkts_acked, tp->fackets_out); - /* hint's skb might be NULL but we don't need to care */ - tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, - tp->fastpath_cnt_hint); + if (ca_ops->pkts_acked) { s32 rtt_us = -1; @@ -2806,7 +2910,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p, } } #endif - *seq_rtt_p = seq_rtt; return flag; } @@ -2817,8 +2920,7 @@ static void tcp_ack_probe(struct sock *sk) /* Was it a usable window open? */ - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) { + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -2847,8 +2949,9 @@ static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, - const u32 ack_seq, const u32 nwin) +static inline int tcp_may_update_window(const struct tcp_sock *tp, + const u32 ack, const u32 ack_seq, + const u32 nwin) { return (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -2917,7 +3020,7 @@ static void tcp_ratehalving_spur_to_response(struct sock *sk) static void tcp_undo_spur_to_response(struct sock *sk, int flag) { - if (flag&FLAG_ECE) + if (flag & FLAG_ECE) tcp_ratehalving_spur_to_response(sk); else tcp_undo_cwr(sk, 1); @@ -2960,7 +3063,7 @@ static int tcp_process_frto(struct sock *sk, int flag) tcp_verify_left_out(tp); /* Duplicate the behavior from Loss state (fastretrans_alert) */ - if (flag&FLAG_DATA_ACKED) + if (flag & FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || @@ -2977,16 +3080,16 @@ static int tcp_process_frto(struct sock *sk, int flag) * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate */ - if (!(flag&FLAG_ANY_PROGRESS) && (flag&FLAG_NOT_DUP)) + if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) return 1; - if (!(flag&FLAG_DATA_ACKED)) { + if (!(flag & FLAG_DATA_ACKED)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); return 1; } } else { - if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { /* Prevent sending of new data. */ tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)); @@ -2994,10 +3097,12 @@ static int tcp_process_frto(struct sock *sk, int flag) } if ((tp->frto_counter >= 2) && - (!(flag&FLAG_FORWARD_PROGRESS) || - ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + (!(flag & FLAG_FORWARD_PROGRESS) || + ((flag & FLAG_DATA_SACKED) && + !(flag & FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ - if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + if (!(flag & FLAG_FORWARD_PROGRESS) && + (flag & FLAG_NOT_DUP)) return 1; tcp_enter_frto_loss(sk, 3, flag); @@ -3043,7 +3148,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; u32 prior_fackets; - s32 seq_rtt; int prior_packets; int frto_cwnd = 0; @@ -3064,13 +3168,14 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tp->bytes_acked += ack - prior_snd_una; else if (icsk->icsk_ca_state == TCP_CA_Loss) /* we assume just one segment left network */ - tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache); + tp->bytes_acked += min(ack - prior_snd_una, + tp->mss_cache); } prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); - if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. * Note, we use the fact that SND.UNA>=SND.WL2. @@ -3109,7 +3214,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, prior_fackets); + flag |= tcp_clean_rtx_queue(sk, prior_fackets); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); @@ -3121,14 +3226,15 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, prior_in_flight, 0); - tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); + tcp_cong_avoid(sk, ack, prior_in_flight); + tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, + flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) - tcp_cong_avoid(sk, ack, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, prior_in_flight); } - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) dst_confirm(sk->sk_dst_cache); return 1; @@ -3153,100 +3259,99 @@ uninteresting_ack: return 0; } - /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when * the fast version below fails. */ -void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) +void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, + int estab) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); - int length=(th->doff*4)-sizeof(struct tcphdr); + int length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; while (length > 0) { - int opcode=*ptr++; + int opcode = *ptr++; int opsize; switch (opcode) { - case TCPOPT_EOL: + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - return; /* don't parse partial options */ - switch (opcode) { - case TCPOPT_MSS: - if (opsize==TCPOLEN_MSS && th->syn && !estab) { - u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); - if (in_mss) { - if (opt_rx->user_mss && opt_rx->user_mss < in_mss) - in_mss = opt_rx->user_mss; - opt_rx->mss_clamp = in_mss; - } - } - break; - case TCPOPT_WINDOW: - if (opsize==TCPOLEN_WINDOW && th->syn && !estab) - if (sysctl_tcp_window_scaling) { - __u8 snd_wscale = *(__u8 *) ptr; - opt_rx->wscale_ok = 1; - if (snd_wscale > 14) { - if (net_ratelimit()) - printk(KERN_INFO "tcp_parse_options: Illegal window " - "scaling value %d >14 received.\n", - snd_wscale); - snd_wscale = 14; - } - opt_rx->snd_wscale = snd_wscale; - } - break; - case TCPOPT_TIMESTAMP: - if (opsize==TCPOLEN_TIMESTAMP) { - if ((estab && opt_rx->tstamp_ok) || - (!estab && sysctl_tcp_timestamps)) { - opt_rx->saw_tstamp = 1; - opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr)); - opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4))); - } + if (opsize > length) + return; /* don't parse partial options */ + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); + if (in_mss) { + if (opt_rx->user_mss && + opt_rx->user_mss < in_mss) + in_mss = opt_rx->user_mss; + opt_rx->mss_clamp = in_mss; } - break; - case TCPOPT_SACK_PERM: - if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { - if (sysctl_tcp_sack) { - opt_rx->sack_ok = 1; - tcp_sack_reset(opt_rx); - } + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW && th->syn && + !estab && sysctl_tcp_window_scaling) { + __u8 snd_wscale = *(__u8 *)ptr; + opt_rx->wscale_ok = 1; + if (snd_wscale > 14) { + if (net_ratelimit()) + printk(KERN_INFO "tcp_parse_options: Illegal window " + "scaling value %d >14 received.\n", + snd_wscale); + snd_wscale = 14; } - break; + opt_rx->snd_wscale = snd_wscale; + } + break; + case TCPOPT_TIMESTAMP: + if ((opsize == TCPOLEN_TIMESTAMP) && + ((estab && opt_rx->tstamp_ok) || + (!estab && sysctl_tcp_timestamps))) { + opt_rx->saw_tstamp = 1; + opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr)); + opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4))); + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM && th->syn && + !estab && sysctl_tcp_sack) { + opt_rx->sack_ok = 1; + tcp_sack_reset(opt_rx); + } + break; - case TCPOPT_SACK: - if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && - !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && - opt_rx->sack_ok) { - TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; - } - break; + case TCPOPT_SACK: + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && + opt_rx->sack_ok) { + TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; + } + break; #ifdef CONFIG_TCP_MD5SIG - case TCPOPT_MD5SIG: - /* - * The MD5 Hash has already been - * checked (see tcp_v{4,6}_do_rcv()). - */ - break; + case TCPOPT_MD5SIG: + /* + * The MD5 Hash has already been + * checked (see tcp_v{4,6}_do_rcv()). + */ + break; #endif - } + } - ptr+=opsize-2; - length-=opsize; + ptr += opsize-2; + length -= opsize; } } } @@ -3257,7 +3362,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, struct tcp_sock *tp) { - if (th->doff == sizeof(struct tcphdr)>>2) { + if (th->doff == sizeof(struct tcphdr) >> 2) { tp->rx_opt.saw_tstamp = 0; return 0; } else if (tp->rx_opt.tstamp_ok && @@ -3342,7 +3447,8 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) +static inline int tcp_paws_discard(const struct sock *sk, + const struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && @@ -3374,16 +3480,16 @@ static void tcp_reset(struct sock *sk) { /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { - case TCP_SYN_SENT: - sk->sk_err = ECONNREFUSED; - break; - case TCP_CLOSE_WAIT: - sk->sk_err = EPIPE; - break; - case TCP_CLOSE: - return; - default: - sk->sk_err = ECONNRESET; + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; } if (!sock_flag(sk, SOCK_DEAD)) @@ -3416,43 +3522,43 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - /* Move to CLOSE_WAIT */ - tcp_set_state(sk, TCP_CLOSE_WAIT); - inet_csk(sk)->icsk_ack.pingpong = 1; - break; + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + inet_csk(sk)->icsk_ack.pingpong = 1; + break; - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - /* Received a retransmission of the FIN, do - * nothing. - */ - break; - case TCP_LAST_ACK: - /* RFC793: Remain in the LAST-ACK state. */ - break; + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; - case TCP_FIN_WAIT1: - /* This case occurs when a simultaneous close - * happens, we must ack the received FIN and - * enter the CLOSING state. - */ - tcp_send_ack(sk); - tcp_set_state(sk, TCP_CLOSING); - break; - case TCP_FIN_WAIT2: - /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_send_ack(sk); - tcp_time_wait(sk, TCP_TIME_WAIT, 0); - break; - default: - /* Only TCP_LISTEN and TCP_CLOSE are left, in these - * cases we should never reach this piece of code. - */ - printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", - __FUNCTION__, sk->sk_state); - break; + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", + __FUNCTION__, sk->sk_state); + break; } /* It _is_ possible, that we have something out-of-order _after_ FIN. @@ -3461,7 +3567,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) __skb_queue_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -3469,13 +3575,14 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) /* Do not send POLL_HUP for half duplex close. */ if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) - sk_wake_async(sk, 1, POLL_HUP); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else - sk_wake_async(sk, 1, POLL_IN); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } } -static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, + u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) @@ -3498,7 +3605,8 @@ static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; tp->duplicate_sack[0].end_seq = end_seq; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, + 4 - tp->rx_opt.tstamp_ok); } } @@ -3538,12 +3646,12 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) { int this_sack; struct tcp_sack_block *sp = &tp->selective_acks[0]; - struct tcp_sack_block *swalk = sp+1; + struct tcp_sack_block *swalk = sp + 1; /* See if the recent change to the first SACK eats into * or hits the sequence space of other SACK blocks, if so coalesce. */ - for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) { + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) { if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { int i; @@ -3551,16 +3659,19 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) * Decrease num_sacks. */ tp->rx_opt.num_sacks--; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); - for (i=this_sack; i < tp->rx_opt.num_sacks; i++) - sp[i] = sp[i+1]; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + + tp->rx_opt.dsack, + 4 - tp->rx_opt.tstamp_ok); + for (i = this_sack; i < tp->rx_opt.num_sacks; i++) + sp[i] = sp[i + 1]; continue; } this_sack++, swalk++; } } -static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +static inline void tcp_sack_swap(struct tcp_sack_block *sack1, + struct tcp_sack_block *sack2) { __u32 tmp; @@ -3583,11 +3694,11 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) if (!cur_sacks) goto new_sack; - for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) { + for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { /* Rotate this_sack to the first one. */ - for (; this_sack>0; this_sack--, sp--) - tcp_sack_swap(sp, sp-1); + for (; this_sack > 0; this_sack--, sp--) + tcp_sack_swap(sp, sp - 1); if (cur_sacks > 1) tcp_sack_maybe_coalesce(tp); return; @@ -3606,14 +3717,15 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) sp--; } for (; this_sack > 0; this_sack--, sp--) - *sp = *(sp-1); + *sp = *(sp - 1); new_sack: /* Build the new head SACK, and we're done. */ sp->start_seq = seq; sp->end_seq = end_seq; tp->rx_opt.num_sacks++; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, + 4 - tp->rx_opt.tstamp_ok); } /* RCV.NXT advances, some SACKs should be eaten. */ @@ -3631,7 +3743,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) return; } - for (this_sack = 0; this_sack < num_sacks; ) { + for (this_sack = 0; this_sack < num_sacks;) { /* Check if the start of the sack is covered by RCV.NXT. */ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; @@ -3650,7 +3762,9 @@ static void tcp_sack_remove(struct tcp_sock *tp) } if (num_sacks != tp->rx_opt.num_sacks) { tp->rx_opt.num_sacks = num_sacks; - tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + + tp->rx_opt.dsack, + 4 - tp->rx_opt.tstamp_ok); } } @@ -3703,14 +3817,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; - __skb_pull(skb, th->doff*4); + __skb_pull(skb, th->doff * 4); TCP_ECN_accept_cwr(tp, skb); if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks, - 4 - tp->rx_opt.tstamp_ok); + 4 - tp->rx_opt.tstamp_ok); } /* Queue data for delivery to the user. @@ -3726,7 +3840,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && sock_owned_by_user(sk) && !tp->urg_data) { int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); + tp->ucopy.len); __set_current_state(TASK_RUNNING); @@ -3744,12 +3858,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) queue_and_out: if (eaten < 0 && (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_stream_rmem_schedule(sk, skb))) { + !sk_rmem_schedule(sk, skb->truesize))) { if (tcp_prune_queue(sk) < 0 || - !sk_stream_rmem_schedule(sk, skb)) + !sk_rmem_schedule(sk, skb->truesize)) goto drop; } - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -3818,9 +3932,9 @@ drop: TCP_ECN_check_ce(tp, skb); if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_stream_rmem_schedule(sk, skb)) { + !sk_rmem_schedule(sk, skb->truesize)) { if (tcp_prune_queue(sk) < 0 || - !sk_stream_rmem_schedule(sk, skb)) + !sk_rmem_schedule(sk, skb->truesize)) goto drop; } @@ -3831,7 +3945,7 @@ drop: SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ @@ -3843,7 +3957,7 @@ drop: tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; } - __skb_queue_head(&tp->out_of_order_queue,skb); + __skb_queue_head(&tp->out_of_order_queue, skb); } else { struct sk_buff *skb1 = tp->out_of_order_queue.prev; u32 seq = TCP_SKB_CB(skb)->seq; @@ -3866,10 +3980,10 @@ drop: if (!after(TCP_SKB_CB(skb1)->seq, seq)) break; } while ((skb1 = skb1->prev) != - (struct sk_buff*)&tp->out_of_order_queue); + (struct sk_buff *)&tp->out_of_order_queue); /* Do skb overlap to previous one? */ - if (skb1 != (struct sk_buff*)&tp->out_of_order_queue && + if (skb1 != (struct sk_buff *)&tp->out_of_order_queue && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ @@ -3879,7 +3993,8 @@ drop: } if (after(seq, TCP_SKB_CB(skb1)->seq)) { /* Partial overlap. */ - tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq); + tcp_dsack_set(tp, seq, + TCP_SKB_CB(skb1)->end_seq); } else { skb1 = skb1->prev; } @@ -3888,15 +4003,17 @@ drop: /* And clean segments covered by new one as whole. */ while ((skb1 = skb->next) != - (struct sk_buff*)&tp->out_of_order_queue && + (struct sk_buff *)&tp->out_of_order_queue && after(end_seq, TCP_SKB_CB(skb1)->seq)) { - if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); - break; - } - __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); - __kfree_skb(skb1); + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + end_seq); + break; + } + __skb_unlink(skb1, &tp->out_of_order_queue); + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); } add_sack: @@ -3919,7 +4036,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ - for (skb = head; skb != tail; ) { + for (skb = head; skb != tail;) { /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; @@ -3957,9 +4074,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, /* Too big header? This can happen with IPv6. */ if (copy < 0) return; - if (end-start < copy) - copy = end-start; - nskb = alloc_skb(copy+header, GFP_ATOMIC); + if (end - start < copy) + copy = end - start; + nskb = alloc_skb(copy + header, GFP_ATOMIC); if (!nskb) return; @@ -3973,7 +4090,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); - sk_stream_set_owner_r(nskb, sk); + skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4069,9 +4186,9 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); tcp_collapse(sk, &sk->sk_receive_queue, sk->sk_receive_queue.next, - (struct sk_buff*)&sk->sk_receive_queue, + (struct sk_buff *)&sk->sk_receive_queue, tp->copied_seq, tp->rcv_nxt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -4091,7 +4208,7 @@ static int tcp_prune_queue(struct sock *sk) */ if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); } if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) @@ -4108,7 +4225,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } - /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. @@ -4170,8 +4286,8 @@ static void tcp_new_space(struct sock *sk) int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, - tp->reordering + 1); - sndmem *= 2*demanded; + tp->reordering + 1); + sndmem *= 2 * demanded; if (sndmem > sk->sk_sndbuf) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -4212,8 +4328,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ - (ofo_possible && - skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && skb_peek(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { @@ -4241,7 +4356,7 @@ static inline void tcp_ack_snd_check(struct sock *sk) * either form (or just set the sysctl tcp_stdurg). */ -static void tcp_check_urg(struct sock * sk, struct tcphdr * th) +static void tcp_check_urg(struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); u32 ptr = ntohs(th->urg_ptr); @@ -4290,8 +4405,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * buggy users. */ if (tp->urg_seq == tp->copied_seq && tp->urg_data && - !sock_flag(sk, SOCK_URGINLINE) && - tp->copied_seq != tp->rcv_nxt) { + !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { @@ -4300,8 +4414,8 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) } } - tp->urg_data = TCP_URG_NOTYET; - tp->urg_seq = ptr; + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = ptr; /* Disable header prediction. */ tp->pred_flags = 0; @@ -4314,7 +4428,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) /* Check if we get a new urgent pointer - normally not. */ if (th->urg) - tcp_check_urg(sk,th); + tcp_check_urg(sk, th); /* Do we wait for any urgent data? - normally not... */ if (tp->urg_data == TCP_URG_NOTYET) { @@ -4356,7 +4470,8 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) return err; } -static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static __sum16 __tcp_checksum_complete_user(struct sock *sk, + struct sk_buff *skb) { __sum16 result; @@ -4370,14 +4485,16 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb return result; } -static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static inline int tcp_checksum_complete_user(struct sock *sk, + struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && - __tcp_checksum_complete_user(sk, skb); + __tcp_checksum_complete_user(sk, skb); } #ifdef CONFIG_NET_DMA -static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, + int hlen) { struct tcp_sock *tp = tcp_sk(sk); int chunk = skb->len - hlen; @@ -4393,7 +4510,9 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); + skb, hlen, + tp->ucopy.iov, chunk, + tp->ucopy.pinned_list); if (dma_cookie < 0) goto out; @@ -4475,7 +4594,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { int tcp_header_len = tp->tcp_header_len; /* Timestamp header prediction: tcp_header_len @@ -4544,7 +4663,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, eaten = 1; } #endif - if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && !copied_early) { __set_current_state(TASK_RUNNING); if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) @@ -4591,9 +4711,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - __skb_pull(skb,tcp_header_len); + __skb_pull(skb, tcp_header_len); __skb_queue_tail(&sk->sk_receive_queue, skb); - sk_stream_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } @@ -4623,7 +4743,7 @@ no_ack: } slow_path: - if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb)) + if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; /* @@ -4830,7 +4950,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); - sk_wake_async(sk, 0, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (sk->sk_write_pending || @@ -4873,7 +4993,8 @@ discard: } /* PAWS check. */ - if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0)) + if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && + tcp_paws_check(&tp->rx_opt, 0)) goto discard_and_undo; if (th->syn) { @@ -4908,7 +5029,6 @@ discard: tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); - tcp_send_synack(sk); #if 0 /* Note, we could accept data and URG from this segment. @@ -4940,7 +5060,6 @@ reset_and_undo: return 1; } - /* * This function implements the receiving procedure of RFC 793 for * all states except ESTABLISHED and TIME_WAIT. @@ -5060,9 +5179,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * are not waked up, because sk->sk_sleep == * NULL and sk->sk_socket == NULL. */ - if (sk->sk_socket) { - sk_wake_async(sk,0,POLL_OUT); - } + if (sk->sk_socket) + sk_wake_async(sk, + SOCK_WAKE_IO, POLL_OUT); tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << @@ -5074,8 +5193,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * and does not calculate rtt. * Fix it at least with timestamps. */ - if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - !tp->srtt) + if (tp->rx_opt.saw_tstamp && + tp->rx_opt.rcv_tsecr && !tp->srtt) tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 652c32368ccc..9aea88b8d4fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -99,7 +99,7 @@ static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, struct tcphdr *th, int protocol, - int tcplen); + unsigned int tcplen); #endif struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { @@ -1020,7 +1020,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, struct tcphdr *th, int protocol, - int tcplen) + unsigned int tcplen) { struct scatterlist sg[4]; __u16 data_len; @@ -1113,7 +1113,7 @@ int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, struct dst_entry *dst, struct request_sock *req, struct tcphdr *th, int protocol, - int tcplen) + unsigned int tcplen) { __be32 saddr, daddr; @@ -1478,7 +1478,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif - __inet_hash(&tcp_hashinfo, newsk, 0); + __inet_hash_nolisten(&tcp_hashinfo, newsk); __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index e7f5ef92cbd8..ce3c41ff50b2 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -115,12 +115,12 @@ static void tcp_lp_init(struct sock *sk) * Will only call newReno CA when away from inference. * From TCP-LP's paper, this will be handled in additive increasement. */ -static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct lp *lp = inet_csk_ca(sk); if (!(lp->flag & LP_WITHIN_INF)) - tcp_reno_cong_avoid(sk, ack, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight); } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f4c1eef89af0..89f0188885c7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -61,27 +61,24 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -static inline void tcp_packets_out_inc(struct sock *sk, - const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - int orig = tp->packets_out; + unsigned int prior_packets = tp->packets_out; + + tcp_advance_send_head(sk, skb); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + + /* Don't override Nagle indefinately with F-RTO */ + if (tp->frto_counter == 2) + tp->frto_counter = 3; tp->packets_out += tcp_skb_pcount(skb); - if (!orig) + if (!prior_packets) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } -static void update_send_head(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_advance_send_head(sk, skb); - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, skb); -} - /* SND.NXT, if window was not shrunk. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( @@ -92,10 +89,10 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) + if (!before(tcp_wnd_end(tp), tp->snd_nxt)) return tp->snd_nxt; else - return tp->snd_una+tp->snd_wnd; + return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. @@ -224,14 +221,14 @@ void tcp_select_initial_window(int __space, __u32 mss, * following RFC2414. Senders, not following this RFC, * will be satisfied with 2. */ - if (mss > (1<<*rcv_wscale)) { + if (mss > (1 << *rcv_wscale)) { int init_cwnd = 4; - if (mss > 1460*3) + if (mss > 1460 * 3) init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; - if (*rcv_wnd > init_cwnd*mss) - *rcv_wnd = init_cwnd*mss; + if (*rcv_wnd > init_cwnd * mss) + *rcv_wnd = init_cwnd * mss; } /* Set the clamp no higher than max representable value */ @@ -281,11 +278,10 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -static inline void TCP_ECN_send_synack(struct tcp_sock *tp, - struct sk_buff *skb) +static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; - if (!(tp->ecn_flags&TCP_ECN_OK)) + if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; } @@ -295,7 +291,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = 0; if (sysctl_tcp_ecn) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; tp->ecn_flags = TCP_ECN_OK; } } @@ -317,7 +313,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); - if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { + if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; @@ -331,6 +327,26 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, } } +/* Constructs common control bits of non-data skb. If SYN/FIN is present, + * auto increment end seqno. + */ +static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +{ + skb->csum = 0; + + TCP_SKB_CB(skb)->flags = flags; + TCP_SKB_CB(skb)->sacked = 0; + + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; + + TCP_SKB_CB(skb)->seq = seq; + if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) + seq++; + TCP_SKB_CB(skb)->end_seq = seq; +} + static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, __u32 tstamp, __u8 **md5_hash) { @@ -434,7 +450,7 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); - *md5_hash = (__u8 *) ptr; + *md5_hash = (__u8 *)ptr; } #endif } @@ -450,7 +466,8 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; @@ -554,8 +571,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->urg_ptr = 0; if (unlikely(tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); + between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { + th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } @@ -619,7 +636,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, #undef SYSCTL_FLAG_SACK } - /* This routine just queue's the buffer * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, @@ -633,10 +649,12 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) tp->write_seq = TCP_SKB_CB(skb)->end_seq; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk_charge_skb(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); } -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { if (skb->len <= mss_now || !sk_can_gso(sk)) { /* Avoid the costly divide in the normal @@ -653,23 +671,18 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned } /* When a modification to fackets out becomes necessary, we need to check - * skb is counted to fackets_out or not. Another important thing is to - * tweak SACK fastpath hint too as it would overwrite all changes unless - * hint is also changed. + * skb is counted to fackets_out or not. */ -static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, +static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, int decr) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->sacked_out || tcp_is_reno(tp)) return; - if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq)) + if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq)) tp->fackets_out -= decr; - - /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */ - if (tp->fastpath_skb_hint != NULL && - after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) - tp->fastpath_cnt_hint -= decr; } /* Function to create two new TCP segments. Shrinks the given segment @@ -677,7 +690,8 @@ static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now) +int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + unsigned int mss_now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -702,7 +716,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; skb->truesize -= nlen; @@ -712,20 +727,16 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; - if (tcp_is_sack(tp) && tp->sacked_out && - (TCP_SKB_CB(skb)->seq == tp->highest_sack)) - tp->highest_sack = TCP_SKB_CB(buff)->seq; - /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + buff->csum = csum_partial_copy_nocheck(skb->data + len, + skb_put(buff, nsize), nsize, 0); skb_trim(skb, len); @@ -772,7 +783,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss tcp_dec_pcount_approx_int(&tp->sacked_out, diff); tcp_verify_left_out(tp); } - tcp_adjust_fackets_out(tp, skb, diff); + tcp_adjust_fackets_out(sk, skb, diff); } /* Link BUFF into the send queue. */ @@ -792,7 +803,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = len; k = 0; - for (i=0; i<skb_shinfo(skb)->nr_frags; i++) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { if (skb_shinfo(skb)->frags[i].size <= eat) { put_page(skb_shinfo(skb)->frags[i].page); eat -= skb_shinfo(skb)->frags[i].size; @@ -815,8 +826,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; /* If len == headlen, we avoid __skb_pull to preserve alignment. */ @@ -830,7 +840,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) skb->truesize -= len; sk->sk_wmem_queued -= len; - sk->sk_forward_alloc += len; + sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); /* Any change of skb->len requires recalculation of tso @@ -898,6 +908,15 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } +/* Bound MSS / TSO packet size with the half of the window */ +static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +{ + if (tp->max_window && pktsize > (tp->max_window >> 1)) + return max(tp->max_window >> 1, 68U - tp->tcp_header_len); + else + return pktsize; +} + /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -920,7 +939,6 @@ void tcp_mtup_init(struct sock *sk) NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ - unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); @@ -931,10 +949,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); - - /* Bound mss with half of window */ - if (tp->max_window && mss_now > (tp->max_window>>1)) - mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); + mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -988,11 +1003,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); - if (tp->max_window && - (xmit_size_goal > (tp->max_window >> 1))) - xmit_size_goal = max((tp->max_window >> 1), - 68U - tp->tcp_header_len); - + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); xmit_size_goal -= (xmit_size_goal % mss_now); } tp->xmit_size_goal = xmit_size_goal; @@ -1001,13 +1012,11 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) } /* Congestion window validation. (RFC2861) */ - static void tcp_cwnd_validate(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out = tp->packets_out; - if (packets_out >= tp->snd_cwnd) { + if (tp->packets_out >= tp->snd_cwnd) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1022,19 +1031,35 @@ static void tcp_cwnd_validate(struct sock *sk) } } -static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +/* Returns the portion of skb which can be sent right away without + * introducing MSS oddities to segment boundaries. In rare cases where + * mss_now != mss_cache, we will request caller to create a small skb + * per input skb which could be mostly avoided here (if desired). + */ +static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now, unsigned int cwnd) { - u32 window, cwnd_len; + struct tcp_sock *tp = tcp_sk(sk); + u32 needed, window, cwnd_len; - window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; cwnd_len = mss_now * cwnd; - return min(window, cwnd_len); + + if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) + return cwnd_len; + + if (skb == tcp_write_queue_tail(sk) && cwnd_len <= skb->len) + return cwnd_len; + + needed = min(skb->len, window); + return needed - needed % mss_now; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, + struct sk_buff *skb) { u32 in_flight, cwnd; @@ -1054,13 +1079,12 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || - (tso_segs > 1 && - tcp_skb_mss(skb) != mss_now)) { + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1080,16 +1104,13 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ - static inline int tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned mss_now, int nonagle) { return (skb->len < mss_now && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } /* Return non-zero if the Nagle test allows this packet to be @@ -1121,14 +1142,15 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, } /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; - return !after(end_seq, tp->snd_una + tp->snd_wnd); + return !after(end_seq, tcp_wnd_end(tp)); } /* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) @@ -1147,8 +1169,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return 0; cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && - !tcp_snd_wnd_test(tp, skb, cur_mss)) + if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) cwnd_quota = 0; return cwnd_quota; @@ -1172,7 +1193,8 @@ int tcp_may_send_now(struct sock *sk) * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now) +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + unsigned int mss_now) { struct sk_buff *buff; int nlen = skb->len - len; @@ -1182,11 +1204,12 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (skb->len != skb->data_len) return tcp_fragment(sk, skb, len, mss_now); - buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC); if (unlikely(buff == NULL)) return -ENOMEM; - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -1197,7 +1220,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; /* This packet was never sent out yet, so no SACK bits. */ @@ -1235,15 +1258,15 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; /* Defer for less than two clock ticks. */ - if (!tp->tso_deferred && ((jiffies<<1)>>1) - (tp->tso_deferred>>1) > 1) + if (tp->tso_deferred && + ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) goto send_now; in_flight = tcp_packets_in_flight(tp); - BUG_ON(tcp_skb_pcount(skb) <= 1 || - (tp->snd_cwnd <= in_flight)); + BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); - send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; @@ -1274,7 +1297,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) } /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies<<1); + tp->tso_deferred = 1 | (jiffies << 1); return 1; @@ -1286,7 +1309,8 @@ send_now: /* Create a new MTU probe if we are ready. * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, - * -1 otherwise */ + * -1 otherwise + */ static int tcp_mtu_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1295,7 +1319,6 @@ static int tcp_mtu_probe(struct sock *sk) int len; int probe_size; int size_needed; - unsigned int pif; int copy; int mss_now; @@ -1312,7 +1335,7 @@ static int tcp_mtu_probe(struct sock *sk) /* Very simple search strategy: just double the MSS. */ mss_now = tcp_current_mss(sk, 0); - probe_size = 2*tp->mss_cache; + probe_size = 2 * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { /* TODO: set timer for probe_converge_event */ @@ -1325,14 +1348,12 @@ static int tcp_mtu_probe(struct sock *sk) if (tp->snd_wnd < size_needed) return -1; - if (after(tp->snd_nxt + size_needed, tp->snd_una + tp->snd_wnd)) + if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; - /* Do we need to wait to drain cwnd? */ - pif = tcp_packets_in_flight(tp); - if (pif + 2 > tp->snd_cwnd) { - /* With no packets in flight, don't stall. */ - if (pif == 0) + /* Do we need to wait to drain cwnd? With none in flight, don't stall */ + if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) { + if (!tcp_packets_in_flight(tp)) return -1; else return 0; @@ -1341,10 +1362,10 @@ static int tcp_mtu_probe(struct sock *sk) /* We're allowed to probe. Build it now. */ if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) return -1; - sk_charge_skb(sk, nskb); + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); - tcp_insert_write_queue_before(nskb, skb, sk); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; @@ -1353,30 +1374,32 @@ static int tcp_mtu_probe(struct sock *sk) nskb->csum = 0; nskb->ip_summed = skb->ip_summed; - len = 0; - while (len < probe_size) { - next = tcp_write_queue_next(sk, skb); + tcp_insert_write_queue_before(nskb, skb, sk); + len = 0; + tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); if (nskb->ip_summed) skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); else nskb->csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), copy, nskb->csum); + skb_put(nskb, copy), + copy, nskb->csum); if (skb->len <= copy) { /* We've eaten all the data from this skb. * Throw it away. */ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; tcp_unlink_write_queue(skb, sk); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_partial(skb->data, skb->len, 0); + skb->csum = csum_partial(skb->data, + skb->len, 0); } else { __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(sk, skb, mss_now); @@ -1385,7 +1408,9 @@ static int tcp_mtu_probe(struct sock *sk) } len += copy; - skb = next; + + if (len >= probe_size) + break; } tcp_init_tso_segs(sk, nskb, nskb->len); @@ -1394,9 +1419,9 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->when = tcp_time_stamp; if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending - * effectively two packets. */ + * effectively two packets. */ tp->snd_cwnd--; - update_send_head(sk, nskb); + tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; @@ -1408,7 +1433,6 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } - /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -1464,17 +1488,9 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) } limit = mss_now; - if (tso_segs > 1) { - limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); - - if (skb->len < limit) { - unsigned int trim = skb->len % mss_now; - - if (trim) - limit = skb->len - trim; - } - } + if (tso_segs > 1) + limit = tcp_mss_split_point(sk, skb, mss_now, + cwnd_quota); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now))) @@ -1488,7 +1504,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ - update_send_head(sk, skb); + tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts++; @@ -1521,7 +1537,6 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { - struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); unsigned int tso_segs, cwnd_quota; @@ -1536,17 +1551,9 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) BUG_ON(!tso_segs); limit = mss_now; - if (tso_segs > 1) { - limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); - - if (skb->len < limit) { - unsigned int trim = skb->len % mss_now; - - if (trim) - limit = skb->len - trim; - } - } + if (tso_segs > 1) + limit = tcp_mss_split_point(sk, skb, mss_now, + cwnd_quota); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now))) @@ -1556,7 +1563,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { - update_send_head(sk, skb); + tcp_event_new_data_sent(sk, skb); tcp_cwnd_validate(sk); return; } @@ -1633,11 +1640,12 @@ u32 __tcp_select_window(struct sock *sk) if (mss > full_space) mss = full_space; - if (free_space < full_space/2) { + if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_memory_pressure) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, + 4U * tp->advmss); if (free_space < mss) return 0; @@ -1670,9 +1678,9 @@ u32 __tcp_select_window(struct sock *sk) * is too small. */ if (window <= free_space - mss || window > free_space) - window = (free_space/mss)*mss; + window = (free_space / mss) * mss; else if (mss == full_space && - free_space > window + full_space/2) + free_space > window + (full_space >> 1)) window = free_space; } @@ -1680,86 +1688,82 @@ u32 __tcp_select_window(struct sock *sk) } /* Attempt to collapse two adjacent SKB's during retransmission. */ -static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, + int mss_now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + int skb_size, next_skb_size; + u16 flags; /* The first test we must make is that neither of these two * SKB's are still referenced by someone else. */ - if (!skb_cloned(skb) && !skb_cloned(next_skb)) { - int skb_size = skb->len, next_skb_size = next_skb->len; - u16 flags = TCP_SKB_CB(skb)->flags; + if (skb_cloned(skb) || skb_cloned(next_skb)) + return; - /* Also punt if next skb has been SACK'd. */ - if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) - return; + skb_size = skb->len; + next_skb_size = next_skb->len; + flags = TCP_SKB_CB(skb)->flags; - /* Next skb is out of window. */ - if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd)) - return; + /* Also punt if next skb has been SACK'd. */ + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; - /* Punt if not enough space exists in the first SKB for - * the data in the second, or the total combined payload - * would exceed the MSS. - */ - if ((next_skb_size > skb_tailroom(skb)) || - ((skb_size + next_skb_size) > mss_now)) - return; + /* Next skb is out of window. */ + if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp))) + return; - BUG_ON(tcp_skb_pcount(skb) != 1 || - tcp_skb_pcount(next_skb) != 1); + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; - if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out && - (TCP_SKB_CB(next_skb)->seq == tp->highest_sack))) - return; + BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ - tcp_unlink_write_queue(next_skb, sk); + tcp_highest_sack_combine(sk, next_skb, skb); - skb_copy_from_linear_data(next_skb, - skb_put(skb, next_skb_size), - next_skb_size); + /* Ok. We will be able to collapse the packet. */ + tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_PARTIAL; + skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), + next_skb_size); - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); + if (next_skb->ip_summed == CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_PARTIAL; - /* Update sequence range on original skb. */ - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - /* Merge over control information. */ - flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags = flags; + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - /* All done, get rid of second SKB and account for it so - * packet counting does not break. - */ - TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(next_skb); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(next_skb); - /* Reno case is special. Sigh... */ - if (tcp_is_reno(tp) && tp->sacked_out) - tcp_dec_pcount_approx(&tp->sacked_out, next_skb); - - tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb)); - tp->packets_out -= tcp_skb_pcount(next_skb); - - /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); - /* manually tune sacktag skb hint */ - if (tp->fastpath_skb_hint == next_skb) { - tp->fastpath_skb_hint = skb; - tp->fastpath_cnt_hint -= tcp_skb_pcount(skb); - } + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags = flags; - sk_stream_free_skb(sk, next_skb); - } + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= tcp_skb_pcount(next_skb); + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST) + tp->lost_out -= tcp_skb_pcount(next_skb); + /* Reno case is special. Sigh... */ + if (tcp_is_reno(tp) && tp->sacked_out) + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + + tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb)); + tp->packets_out -= tcp_skb_pcount(next_skb); + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + + sk_wmem_free_skb(sk, next_skb); } /* Do a simple retransmit without using the backoff mechanisms in @@ -1778,12 +1782,12 @@ void tcp_simple_retransmit(struct sock *sk) if (skb == tcp_send_head(sk)) break; if (skb->len > mss && - !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { - if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); lost = 1; @@ -1848,7 +1852,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd) + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; @@ -1862,8 +1866,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (skb->len < (cur_mss >> 1)) && (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && (!tcp_skb_is_last(sk, skb)) && - (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && - (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && + (skb_shinfo(skb)->nr_frags == 0 && + skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && + (tcp_skb_pcount(skb) == 1 && + tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); @@ -1878,12 +1884,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { - TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + /* Reuse, even though it does some unnecessary work */ + tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, + TCP_SKB_CB(skb)->flags); skb->ip_summed = CHECKSUM_NONE; - skb->csum = 0; } } @@ -1901,7 +1905,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->total_retrans++; #if FASTRETRANS_DEBUG > 0 - if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { if (net_ratelimit()) printk(KERN_DEBUG "retrans_out leaked.\n"); } @@ -1943,7 +1947,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; packet_cnt = tp->retransmit_cnt_hint; - }else{ + } else { skb = tcp_write_queue_head(sk); packet_cnt = 0; } @@ -1970,7 +1974,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) return; if (sacked & TCPCB_LOST) { - if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (tcp_retransmit_skb(sk, skb)) { tp->retransmit_skb_hint = NULL; return; @@ -2028,7 +2032,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; tp->forward_skb_hint = skb; - if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) break; if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) @@ -2052,7 +2056,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } } - /* Send a fin. The caller locks the socket for us. This cannot be * allowed to fail queueing a FIN frame under any circumstances. */ @@ -2083,16 +2086,9 @@ void tcp_send_fin(struct sock *sk) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - TCP_SKB_CB(skb)->seq = tp->write_seq; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + tcp_init_nondata_skb(skb, tp->write_seq, + TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); @@ -2116,16 +2112,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - + tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + TCPCB_FLAG_ACK | TCPCB_FLAG_RST); /* Send it off. */ - TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk); - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); @@ -2138,14 +2127,14 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) */ int tcp_send_synack(struct sock *sk) { - struct sk_buff* skb; + struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { + if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; } - if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) { + if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) @@ -2153,8 +2142,9 @@ int tcp_send_synack(struct sock *sk) tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); - sk_stream_free_skb(sk, skb); - sk_charge_skb(sk, nskb); + sk_wmem_free_skb(sk, skb); + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -2168,8 +2158,8 @@ int tcp_send_synack(struct sock *sk) /* * Prepare a SYN-ACK. */ -struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req) +struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); @@ -2212,12 +2202,11 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_ECN_make_synack(req, th); th->source = inet_sk(sk)->sport; th->dest = ireq->rmt_port; - TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + /* Setting of flags are superfluous here for callers (and ECE is + * not even correctly set) + */ + tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, + TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -2249,7 +2238,6 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, NULL) ); - skb->csum = 0; th->doff = (tcp_header_size >> 2); TCP_INC_STATS(TCP_MIB_OUTSEGS); @@ -2341,23 +2329,17 @@ int tcp_connect(struct sock *sk) /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; - TCP_ECN_send_syn(sk, buff); - TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->gso_segs = 1; - skb_shinfo(buff)->gso_size = 0; - skb_shinfo(buff)->gso_type = 0; - buff->csum = 0; tp->snd_nxt = tp->write_seq; - TCP_SKB_CB(buff)->seq = tp->write_seq++; - TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); + TCP_ECN_send_syn(sk, buff); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); __tcp_add_write_queue_tail(sk, buff); - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); @@ -2386,9 +2368,10 @@ void tcp_send_delayed_ack(struct sock *sk) if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); - int max_ato = HZ/2; + int max_ato = HZ / 2; - if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) + if (icsk->icsk_ack.pingpong || + (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ @@ -2398,7 +2381,7 @@ void tcp_send_delayed_ack(struct sock *sk) * directly. */ if (tp->srtt) { - int rtt = max(tp->srtt>>3, TCP_DELACK_MIN); + int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; @@ -2432,37 +2415,32 @@ void tcp_send_delayed_ack(struct sock *sk) /* This routine sends an ack and also updates the window. */ void tcp_send_ack(struct sock *sk) { - /* If we have been reset, we may not send again. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *buff; + struct sk_buff *buff; - /* We are not putting this on the write queue, so - * tcp_transmit_skb() will set the ownership to this - * sock. - */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); - if (buff == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); - return; - } + /* If we have been reset, we may not send again. */ + if (sk->sk_state == TCP_CLOSE) + return; - /* Reserve space for headers and prepare control bits. */ - skb_reserve(buff, MAX_TCP_HEADER); - buff->csum = 0; - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->gso_segs = 1; - skb_shinfo(buff)->gso_size = 0; - skb_shinfo(buff)->gso_type = 0; - - /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk); - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (buff == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); + return; } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_TCP_HEADER); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); + + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } /* This routine sends a packet with an out of date sequence @@ -2488,66 +2466,57 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ - TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) { - if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if ((skb = tcp_send_head(sk)) != NULL && - before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { - int err; - unsigned int mss = tcp_current_mss(sk, 0); - unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; - - if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) - tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; - - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) - */ - if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || - skb->len > mss) { - seg_size = min(seg_size, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) - return -1; - } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + if (sk->sk_state == TCP_CLOSE) + return -1; + + if ((skb = tcp_send_head(sk)) != NULL && + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { + int err; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); - if (!err) { - update_send_head(sk, skb); - } - return err; - } else { - if (tp->urg_mode && - between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF)) - tcp_xmit_probe_skb(sk, TCPCB_URG); - return tcp_xmit_probe_skb(sk, 0); - } + if (tcp_fragment(sk, skb, seg_size, mss)) + return -1; + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(sk, skb, mss); + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_event_new_data_sent(sk, skb); + return err; + } else { + if (tp->urg_mode && + between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + tcp_xmit_probe_skb(sk, 1); + return tcp_xmit_probe_skb(sk, 0); } - return -1; } /* A window probe timeout has occurred. If window is not closed send diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index be27a33a1c68..2747ec7bfb63 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -15,8 +15,7 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index d8970ecfcfc8..803d758a2b12 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -114,13 +114,31 @@ static int tcp_orphan_retries(struct sock *sk, int alive) return retries; } +static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) +{ + /* Black hole detection */ + if (sysctl_tcp_mtu_probing) { + if (!icsk->icsk_mtup.enabled) { + icsk->icsk_mtup.enabled = 1; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + struct tcp_sock *tp = tcp_sk(sk); + int mss; + + mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; + mss = min(sysctl_tcp_base_mss, mss); + mss = max(mss, 68 - tp->tcp_header_len); + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + } +} + /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); int retry_until; - int mss; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) @@ -129,18 +147,7 @@ static int tcp_write_timeout(struct sock *sk) } else { if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { /* Black hole detection */ - if (sysctl_tcp_mtu_probing) { - if (!icsk->icsk_mtup.enabled) { - icsk->icsk_mtup.enabled = 1; - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } else { - mss = min(sysctl_tcp_base_mss, - tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2); - mss = max(mss, 68 - tp->tcp_header_len); - icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); - tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - } - } + tcp_mtu_probing(icsk, sk); dst_negative_advice(&sk->sk_dst_cache); } @@ -179,7 +186,7 @@ static void tcp_delack_timer(unsigned long data) goto out_unlock; } - sk_stream_mem_reclaim(sk); + sk_mem_reclaim_partial(sk); if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; @@ -219,7 +226,7 @@ static void tcp_delack_timer(unsigned long data) out: if (tcp_memory_pressure) - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); @@ -413,7 +420,7 @@ static void tcp_write_timer(unsigned long data) TCP_CHECK_TIMER(sk); out: - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); @@ -507,7 +514,7 @@ static void tcp_keepalive_timer (unsigned long data) } TCP_CHECK_TIMER(sk); - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); resched: inet_csk_reset_keepalive_timer (sk, elapsed); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 007304e99842..be24d6ee34bd 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -162,14 +162,13 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) } EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); -static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) - return tcp_reno_cong_avoid(sk, ack, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight); /* The key players are v_beg_snd_una and v_beg_snd_nxt. * @@ -228,7 +227,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight); } else { u32 rtt, target_cwnd, diff; diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 8fb2aee0b1a4..d16689e98516 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -114,14 +114,13 @@ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) tcp_veno_init(sk); } -static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) - return tcp_reno_cong_avoid(sk, ack, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, in_flight); /* limited by applications */ if (!tcp_is_cwnd_limited(sk, in_flight)) @@ -132,7 +131,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ - tcp_reno_cong_avoid(sk, ack, in_flight, flag); + tcp_reno_cong_avoid(sk, ack, in_flight); } else { u32 rtt, target_cwnd; diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index c107fba7430e..e03b10183a8b 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -69,8 +69,7 @@ static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); } -static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, - u32 in_flight, int flag) +static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 03c400ca14c5..2fb8d731026b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -82,6 +82,7 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <asm/ioctls.h> +#include <linux/bootmem.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> @@ -110,10 +111,25 @@ */ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; +EXPORT_SYMBOL(udp_statistics); + +DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; +EXPORT_SYMBOL(udp_stats_in6); struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); +int sysctl_udp_mem[3] __read_mostly; +int sysctl_udp_rmem_min __read_mostly; +int sysctl_udp_wmem_min __read_mostly; + +EXPORT_SYMBOL(sysctl_udp_mem); +EXPORT_SYMBOL(sysctl_udp_rmem_min); +EXPORT_SYMBOL(sysctl_udp_wmem_min); + +atomic_t udp_memory_allocated; +EXPORT_SYMBOL(udp_memory_allocated); + static inline int __udp_lib_lport_inuse(__u16 num, const struct hlist_head udptable[]) { @@ -214,7 +230,7 @@ gotit: if (sk_unhashed(sk)) { head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); - sock_prot_inc_use(sk->sk_prot); + sock_prot_inuse_add(sk->sk_prot, 1); } error = 0; fail: @@ -402,7 +418,7 @@ out: void udp_err(struct sk_buff *skb, u32 info) { - return __udp4_lib_err(skb, info, udp_hash); + __udp4_lib_err(skb, info, udp_hash); } /* @@ -471,6 +487,7 @@ static int udp_push_pending_frames(struct sock *sk) struct sk_buff *skb; struct udphdr *uh; int err = 0; + int is_udplite = IS_UDPLITE(sk); __wsum csum = 0; /* Grab the skbuff where UDP header space exists. */ @@ -486,7 +503,7 @@ static int udp_push_pending_frames(struct sock *sk) uh->len = htons(up->len); uh->check = 0; - if (up->pcflag) /* UDP-Lite */ + if (is_udplite) /* UDP-Lite */ csum = udplite_csum_outgoing(sk, skb); else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ @@ -514,7 +531,7 @@ out: up->len = 0; up->pending = 0; if (!err) - UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, up->pcflag); + UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -531,7 +548,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, __be32 daddr, faddr, saddr; __be16 dport; u8 tos; - int err, is_udplite = up->pcflag; + int err, is_udplite = IS_UDPLITE(sk); int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); @@ -621,7 +638,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, connected = 0; } - if (MULTICAST(daddr)) { + if (ipv4_is_multicast(daddr)) { if (!ipc.oif) ipc.oif = inet->mc_index; if (!saddr) @@ -643,7 +660,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { .sport = inet->sport, .dport = dport } } }; security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(&rt, &fl, sk, 1); + err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); @@ -825,6 +842,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; unsigned int ulen, copied; + int peeked; int err; int is_udplite = IS_UDPLITE(sk); @@ -838,7 +856,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return ip_recv_error(sk, msg, len); try_again: - skb = skb_recv_datagram(sk, flags, noblock, &err); + skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, &err); if (!skb) goto out; @@ -873,6 +892,9 @@ try_again: if (err) goto out_free; + if (!peeked) + UDP_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite); + sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ @@ -891,14 +913,17 @@ try_again: err = ulen; out_free: + lock_sock(sk); skb_free_datagram(sk, skb); + release_sock(sk); out: return err; csum_copy_err: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); - - skb_kill_datagram(sk, skb, flags); + lock_sock(sk); + if (!skb_kill_datagram(sk, skb, flags)) + UDP_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); + release_sock(sk); if (noblock) return -EAGAIN; @@ -940,6 +965,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int rc; + int is_udplite = IS_UDPLITE(sk); /* * Charge it to the socket, dropping if the queue is full. @@ -967,7 +993,8 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) ret = (*up->encap_rcv)(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag); + UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } @@ -978,7 +1005,7 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if ((up->pcflag & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { /* * MIB statistics other than incrementing the error count are @@ -1019,15 +1046,14 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) - UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag); + UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; } - UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, up->pcflag); return 0; drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, up->pcflag); + UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -1062,7 +1088,15 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { - int ret = udp_queue_rcv_skb(sk, skb1); + int ret = 0; + + bh_lock_sock_nested(sk); + if (!sock_owned_by_user(sk)) + ret = udp_queue_rcv_skb(sk, skb1); + else + sk_add_backlog(sk, skb1); + bh_unlock_sock(sk); + if (ret > 0) /* we should probably re-process instead * of dropping packets here. */ @@ -1155,7 +1189,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], inet_iif(skb), udptable); if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + int ret = 0; + bh_lock_sock_nested(sk); + if (!sock_owned_by_user(sk)) + ret = udp_queue_rcv_skb(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); sock_put(sk); /* a return value > 0 means to resubmit the input, but @@ -1236,6 +1276,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val; int err = 0; + int is_udplite = IS_UDPLITE(sk); if (optlen<sizeof(int)) return -EINVAL; @@ -1277,7 +1318,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* The sender sets actual checksum coverage length via this option. * The case coverage > packet length is handled by send module. */ case UDPLITE_SEND_CSCOV: - if (!up->pcflag) /* Disable the option on UDP sockets */ + if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; @@ -1289,7 +1330,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, * sense, this should be set to at least 8 (as done below). If zero is * used, this again means full checksum coverage. */ case UDPLITE_RECV_CSCOV: - if (!up->pcflag) /* Disable the option on UDP sockets */ + if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; @@ -1449,6 +1490,10 @@ struct proto udp_prot = { .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem = &sysctl_udp_wmem_min, + .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, @@ -1505,6 +1550,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) } static void *udp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(udp_hash_lock) { read_lock(&udp_hash_lock); return *pos ? udp_get_idx(seq, *pos-1) : (void *)1; @@ -1524,6 +1570,7 @@ static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void udp_seq_stop(struct seq_file *seq, void *v) + __releases(udp_hash_lock) { read_unlock(&udp_hash_lock); } @@ -1644,6 +1691,25 @@ void udp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ +void __init udp_init(void) +{ + unsigned long limit; + + /* Set the pressure threshold up by the same strategy of TCP. It is a + * fraction of global memory that is up to 1/2 at 256 MB, decreasing + * toward zero with the amount of memory, with a floor of 128 pages. + */ + limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); + limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = max(limit, 128UL); + sysctl_udp_mem[0] = limit / 4 * 3; + sysctl_udp_mem[1] = limit; + sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; + + sysctl_udp_rmem_min = SK_MEM_QUANTUM; + sysctl_udp_wmem_min = SK_MEM_QUANTUM; +} + EXPORT_SYMBOL(udp_disconnect); EXPORT_SYMBOL(udp_hash); EXPORT_SYMBOL(udp_hash_lock); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index f5baeb3e8b85..001b881ca36f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -35,7 +35,7 @@ static int udplite_rcv(struct sk_buff *skb) static void udplite_err(struct sk_buff *skb, u32 info) { - return __udp4_lib_err(skb, info, udplite_hash); + __udp4_lib_err(skb, info, udplite_hash); } static struct net_protocol udplite_protocol = { diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 5e95c8a07efb..390dcb1354a5 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -16,7 +16,11 @@ #include <net/ip.h> #include <net/xfrm.h> -#ifdef CONFIG_NETFILTER +int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return xfrm4_extract_header(skb); +} + static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) { if (skb->dst == NULL) { @@ -31,129 +35,35 @@ drop: kfree_skb(skb); return NET_RX_DROP; } -#endif int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { - int err; - __be32 seq; - struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH]; - struct xfrm_state *x; - int xfrm_nr = 0; - int decaps = 0; - unsigned int nhoff = offsetof(struct iphdr, protocol); - - seq = 0; - if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) - goto drop; - - do { - const struct iphdr *iph = ip_hdr(skb); - - if (xfrm_nr == XFRM_MAX_DEPTH) - goto drop; - - x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, - nexthdr, AF_INET); - if (x == NULL) - goto drop; - - spin_lock(&x->lock); - if (unlikely(x->km.state != XFRM_STATE_VALID)) - goto drop_unlock; - - if ((x->encap ? x->encap->encap_type : 0) != encap_type) - goto drop_unlock; - - if (x->props.replay_window && xfrm_replay_check(x, seq)) - goto drop_unlock; - - if (xfrm_state_check_expire(x)) - goto drop_unlock; - - nexthdr = x->type->input(x, skb); - if (nexthdr <= 0) - goto drop_unlock; - - skb_network_header(skb)[nhoff] = nexthdr; - - /* only the first xfrm gets the encap type */ - encap_type = 0; - - if (x->props.replay_window) - xfrm_replay_advance(x, seq); - - x->curlft.bytes += skb->len; - x->curlft.packets++; - - spin_unlock(&x->lock); - - xfrm_vec[xfrm_nr++] = x; - - if (x->outer_mode->input(x, skb)) - goto drop; - - if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { - decaps = 1; - break; - } - - err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); - if (err < 0) - goto drop; - } while (!err); - - /* Allocate new secpath or COW existing one. */ - - if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { - struct sec_path *sp; - sp = secpath_dup(skb->sp); - if (!sp) - goto drop; - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; - } - if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) - goto drop; - - memcpy(skb->sp->xvec + skb->sp->len, xfrm_vec, - xfrm_nr * sizeof(xfrm_vec[0])); - skb->sp->len += xfrm_nr; + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + return xfrm_input(skb, nexthdr, spi, encap_type); +} +EXPORT_SYMBOL(xfrm4_rcv_encap); - nf_reset(skb); +int xfrm4_transport_finish(struct sk_buff *skb, int async) +{ + struct iphdr *iph = ip_hdr(skb); - if (decaps) { - dst_release(skb->dst); - skb->dst = NULL; - netif_rx(skb); - return 0; - } else { -#ifdef CONFIG_NETFILTER - __skb_push(skb, skb->data - skb_network_header(skb)); - ip_hdr(skb)->tot_len = htons(skb->len); - ip_send_check(ip_hdr(skb)); + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; - NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, - xfrm4_rcv_encap_finish); - return 0; -#else - return -ip_hdr(skb)->protocol; +#ifndef CONFIG_NETFILTER + if (!async) + return -iph->protocol; #endif - } -drop_unlock: - spin_unlock(&x->lock); - xfrm_state_put(x); -drop: - while (--xfrm_nr >= 0) - xfrm_state_put(xfrm_vec[xfrm_nr]); + __skb_push(skb, skb->data - skb_network_header(skb)); + iph->tot_len = htons(skb->len); + ip_send_check(iph); - kfree_skb(skb); + NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + xfrm4_rcv_encap_finish); return 0; } -EXPORT_SYMBOL(xfrm4_rcv_encap); /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index e42e122414be..e093a7b59e18 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -17,6 +17,21 @@ #include <net/ip.h> #include <net/xfrm.h> +static void xfrm4_beet_make_header(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + iph->ihl = 5; + iph->version = 4; + + iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; + iph->tos = XFRM_MODE_SKB_CB(skb)->tos; + + iph->id = XFRM_MODE_SKB_CB(skb)->id; + iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; + iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; +} + /* Add encapsulation header. * * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. @@ -40,10 +55,12 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) offsetof(struct iphdr, protocol); skb->transport_header = skb->network_header + sizeof(*iph); + xfrm4_beet_make_header(skb); + ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen); top_iph = ip_hdr(skb); - memmove(top_iph, iph, sizeof(*iph)); + if (unlikely(optlen)) { BUG_ON(optlen < 0); @@ -65,43 +82,46 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); - int phlen = 0; + struct iphdr *iph; int optlen = 0; - u8 ph_nexthdr = 0; int err = -EINVAL; - if (unlikely(iph->protocol == IPPROTO_BEETPH)) { + if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { struct ip_beet_phdr *ph; + int phlen; if (!pskb_may_pull(skb, sizeof(*ph))) goto out; - ph = (struct ip_beet_phdr *)(ipip_hdr(skb) + 1); + + ph = (struct ip_beet_phdr *)skb->data; phlen = sizeof(*ph) + ph->padlen; optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); if (optlen < 0 || optlen & 3 || optlen > 250) goto out; - if (!pskb_may_pull(skb, phlen + optlen)) - goto out; - skb->len -= phlen + optlen; + XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; - ph_nexthdr = ph->nexthdr; + if (!pskb_may_pull(skb, phlen)); + goto out; + __skb_pull(skb, phlen); } - skb_set_network_header(skb, phlen - sizeof(*iph)); - memmove(skb_network_header(skb), iph, sizeof(*iph)); - skb_set_transport_header(skb, phlen + optlen); - skb->data = skb_transport_header(skb); + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + memmove(skb->data - skb->mac_len, skb_mac_header(skb), + skb->mac_len); + skb_set_mac_header(skb, -skb->mac_len); + + xfrm4_beet_make_header(skb); iph = ip_hdr(skb); - iph->ihl = (sizeof(*iph) + optlen) / 4; - iph->tot_len = htons(skb->len + iph->ihl * 4); + + iph->ihl += optlen / 4; + iph->tot_len = htons(skb->len); iph->daddr = x->sel.daddr.a4; iph->saddr = x->sel.saddr.a4; - if (ph_nexthdr) - iph->protocol = ph_nexthdr; iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); err = 0; @@ -110,8 +130,10 @@ out: } static struct xfrm_mode xfrm4_beet_mode = { - .input = xfrm4_beet_input, - .output = xfrm4_beet_output, + .input2 = xfrm4_beet_input, + .input = xfrm_prepare_input, + .output2 = xfrm4_beet_output, + .output = xfrm4_prepare_output, .owner = THIS_MODULE, .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index e4deecba6dd2..8dee617ee900 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -16,92 +16,60 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb) { - struct iphdr *outer_iph = ip_hdr(skb); struct iphdr *inner_iph = ipip_hdr(skb); - if (INET_ECN_is_ce(outer_iph->tos)) + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP_ECN_set_ce(inner_iph); } -static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) -{ - if (INET_ECN_is_ce(iph->tos)) - IP6_ECN_set_ce(ipv6_hdr(skb)); -} - /* Add encapsulation header. * * The top IP header will be constructed per RFC 2401. */ -static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { struct dst_entry *dst = skb->dst; - struct xfrm_dst *xdst = (struct xfrm_dst*)dst; - struct iphdr *iph, *top_iph; + struct iphdr *top_iph; int flags; - iph = ip_hdr(skb); - skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); - skb->transport_header = skb->network_header + sizeof(*iph); + skb->transport_header = skb->network_header + sizeof(*top_iph); top_iph = ip_hdr(skb); top_iph->ihl = 5; top_iph->version = 4; - flags = x->props.flags; + top_iph->protocol = x->inner_mode->afinfo->proto; /* DS disclosed */ - if (xdst->route->ops->family == AF_INET) { - top_iph->protocol = IPPROTO_IPIP; - top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (iph->frag_off & htons(IP_DF)); - } -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else { - struct ipv6hdr *ipv6h = (struct ipv6hdr*)iph; - top_iph->protocol = IPPROTO_IPV6; - top_iph->tos = INET_ECN_encapsulate(iph->tos, ipv6_get_dsfield(ipv6h)); - top_iph->frag_off = 0; - } -#endif + top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos, + XFRM_MODE_SKB_CB(skb)->tos); + flags = x->props.flags; if (flags & XFRM_STATE_NOECN) IP_ECN_clear(top_iph); - if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst->child, 0); + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + 0 : XFRM_MODE_SKB_CB(skb)->frag_off; + ip_select_ident(top_iph, dst->child, NULL); top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; - skb->protocol = htons(ETH_P_IP); - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); return 0; } -static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); const unsigned char *old_mac; int err = -EINVAL; - switch (iph->protocol){ - case IPPROTO_IPIP: - break; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - case IPPROTO_IPV6: - break; -#endif - default: - goto out; - } + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) + goto out; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; @@ -110,20 +78,11 @@ static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) goto out; - iph = ip_hdr(skb); - if (iph->protocol == IPPROTO_IPIP) { - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, ipip_hdr(skb)); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - } -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else { - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(iph, skb); - skb->protocol = htons(ETH_P_IPV6); - } -#endif + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); @@ -135,19 +94,21 @@ out: } static struct xfrm_mode xfrm4_tunnel_mode = { - .input = xfrm4_tunnel_input, - .output = xfrm4_tunnel_output, + .input2 = xfrm4_mode_tunnel_input, + .input = xfrm_prepare_input, + .output2 = xfrm4_mode_tunnel_output, + .output = xfrm4_prepare_output, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, }; -static int __init xfrm4_tunnel_init(void) +static int __init xfrm4_mode_tunnel_init(void) { return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); } -static void __exit xfrm4_tunnel_exit(void) +static void __exit xfrm4_mode_tunnel_exit(void) { int err; @@ -155,7 +116,7 @@ static void __exit xfrm4_tunnel_exit(void) BUG_ON(err); } -module_init(xfrm4_tunnel_init); -module_exit(xfrm4_tunnel_exit); +module_init(xfrm4_mode_tunnel_init); +module_exit(xfrm4_mode_tunnel_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index c4a7156962bd..d5a58a818021 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -8,11 +8,12 @@ * 2 of the License, or (at your option) any later version. */ -#include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter_ipv4.h> +#include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/icmp.h> @@ -25,8 +26,6 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) goto out; - IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df) goto out; @@ -40,106 +39,54 @@ out: return ret; } -static inline int xfrm4_output_one(struct sk_buff *skb) +int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct iphdr *iph; int err; - if (x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL) { - err = xfrm4_tunnel_check_size(skb); - if (err) - goto error_nolock; - } - - err = xfrm_output(skb); + err = xfrm4_tunnel_check_size(skb); if (err) - goto error_nolock; + return err; - iph = ip_hdr(skb); - iph->tot_len = htons(skb->len); - ip_send_check(iph); + XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; - IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; - err = 0; - -out_exit: - return err; -error_nolock: - kfree_skb(skb); - goto out_exit; + return xfrm4_extract_header(skb); } -static int xfrm4_output_finish2(struct sk_buff *skb) +int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - while (likely((err = xfrm4_output_one(skb)) == 0)) { - nf_reset(skb); - - err = nf_hook(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, - skb->dst->dev, dst_output); - if (unlikely(err != 1)) - break; + err = x->inner_mode->afinfo->extract_output(x, skb); + if (err) + return err; - if (!skb->dst->xfrm) - return dst_output(skb); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED; - err = nf_hook(PF_INET, NF_IP_POST_ROUTING, skb, NULL, - skb->dst->dev, xfrm4_output_finish2); - if (unlikely(err != 1)) - break; - } + skb->protocol = htons(ETH_P_IP); - return err; + return x->outer_mode->output2(x, skb); } +EXPORT_SYMBOL(xfrm4_prepare_output); static int xfrm4_output_finish(struct sk_buff *skb) { - struct sk_buff *segs; - #ifdef CONFIG_NETFILTER if (!skb->dst->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } -#endif - if (!skb_is_gso(skb)) - return xfrm4_output_finish2(skb); + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; +#endif skb->protocol = htons(ETH_P_IP); - segs = skb_gso_segment(skb, 0); - kfree_skb(skb); - if (unlikely(IS_ERR(segs))) - return PTR_ERR(segs); - - do { - struct sk_buff *nskb = segs->next; - int err; - - segs->next = NULL; - err = xfrm4_output_finish2(segs); - - if (unlikely(err)) { - while ((segs = nskb)) { - nskb = segs->next; - segs->next = NULL; - kfree_skb(segs); - } - return err; - } - - segs = nskb; - } while (segs); - - return 0; + return xfrm_output(skb); } int xfrm4_output(struct sk_buff *skb) { - return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, - xfrm4_output_finish, + return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, + NULL, skb->dst->dev, xfrm4_output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cc86fb110dd8..3783e3ee56a4 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -8,36 +8,54 @@ * */ -#include <linux/compiler.h> +#include <linux/err.h> +#include <linux/kernel.h> #include <linux/inetdevice.h> +#include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +static struct dst_entry *xfrm4_dst_lookup(int tos, xfrm_address_t *saddr, + xfrm_address_t *daddr) { - return __ip_route_output_key((struct rtable**)dst, fl); -} - -static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) -{ - struct rtable *rt; - struct flowi fl_tunnel = { + struct flowi fl = { .nl_u = { .ip4_u = { + .tos = tos, .daddr = daddr->a4, }, }, }; + struct dst_entry *dst; + struct rtable *rt; + int err; - if (!xfrm4_dst_lookup((struct xfrm_dst **)&rt, &fl_tunnel)) { - saddr->a4 = rt->rt_src; - dst_release(&rt->u.dst); - return 0; - } - return -EHOSTUNREACH; + if (saddr) + fl.fl4_src = saddr->a4; + + err = __ip_route_output_key(&init_net, &rt, &fl); + dst = &rt->u.dst; + if (err) + dst = ERR_PTR(err); + return dst; +} + +static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr) +{ + struct dst_entry *dst; + struct rtable *rt; + + dst = xfrm4_dst_lookup(0, NULL, daddr); + if (IS_ERR(dst)) + return -EHOSTUNREACH; + + rt = (struct rtable *)dst; + saddr->a4 = rt->rt_src; + dst_release(dst); + return 0; } static struct dst_entry * @@ -61,142 +79,49 @@ __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) return dst; } -/* Allocate chain of dst_entry's, attach known xfrm's, calculate - * all the metrics... Shortly, bundle a bundle. - */ +static int xfrm4_get_tos(struct flowi *fl) +{ + return fl->fl4_tos; +} -static int -__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, - struct flowi *fl, struct dst_entry **dst_p) +static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) { - struct dst_entry *dst, *dst_prev; - struct rtable *rt0 = (struct rtable*)(*dst_p); - struct rtable *rt = rt0; - struct flowi fl_tunnel = { - .nl_u = { - .ip4_u = { - .saddr = fl->fl4_src, - .daddr = fl->fl4_dst, - .tos = fl->fl4_tos - } - } - }; - int i; - int err; - int header_len = 0; - int trailer_len = 0; + return 0; +} - dst = dst_prev = NULL; - dst_hold(&rt->u.dst); +static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) +{ + struct rtable *rt = (struct rtable *)xdst->route; - for (i = 0; i < nx; i++) { - struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); - struct xfrm_dst *xdst; + xdst->u.rt.fl = rt->fl; - if (unlikely(dst1 == NULL)) { - err = -ENOBUFS; - dst_release(&rt->u.dst); - goto error; - } + xdst->u.dst.dev = dev; + dev_hold(dev); - if (!dst) - dst = dst1; - else { - dst_prev->child = dst1; - dst1->flags |= DST_NOHASH; - dst_clone(dst1); - } + xdst->u.rt.idev = in_dev_get(dev); + if (!xdst->u.rt.idev) + return -ENODEV; - xdst = (struct xfrm_dst *)dst1; - xdst->route = &rt->u.dst; - xdst->genid = xfrm[i]->genid; - - dst1->next = dst_prev; - dst_prev = dst1; - - header_len += xfrm[i]->props.header_len; - trailer_len += xfrm[i]->props.trailer_len; - - if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { - unsigned short encap_family = xfrm[i]->props.family; - switch (encap_family) { - case AF_INET: - fl_tunnel.fl4_dst = xfrm[i]->id.daddr.a4; - fl_tunnel.fl4_src = xfrm[i]->props.saddr.a4; - break; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - case AF_INET6: - ipv6_addr_copy(&fl_tunnel.fl6_dst, (struct in6_addr*)&xfrm[i]->id.daddr.a6); - ipv6_addr_copy(&fl_tunnel.fl6_src, (struct in6_addr*)&xfrm[i]->props.saddr.a6); - break; -#endif - default: - BUG_ON(1); - } - err = xfrm_dst_lookup((struct xfrm_dst **)&rt, - &fl_tunnel, encap_family); - if (err) - goto error; - } else - dst_hold(&rt->u.dst); - } + xdst->u.rt.peer = rt->peer; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); - dst_prev->child = &rt->u.dst; - dst->path = &rt->u.dst; - - *dst_p = dst; - dst = dst_prev; - - dst_prev = *dst_p; - i = 0; - for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { - struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; - x->u.rt.fl = *fl; - - dst_prev->xfrm = xfrm[i++]; - dst_prev->dev = rt->u.dst.dev; - if (rt->u.dst.dev) - dev_hold(rt->u.dst.dev); - dst_prev->obsolete = -1; - dst_prev->flags |= DST_HOST; - dst_prev->lastuse = jiffies; - dst_prev->header_len = header_len; - dst_prev->nfheader_len = 0; - dst_prev->trailer_len = trailer_len; - memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); - - /* Copy neighbout for reachability confirmation */ - dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); - dst_prev->input = rt->u.dst.input; - dst_prev->output = dst_prev->xfrm->outer_mode->afinfo->output; - if (rt0->peer) - atomic_inc(&rt0->peer->refcnt); - x->u.rt.peer = rt0->peer; - /* Sheit... I remember I did this right. Apparently, - * it was magically lost, so this code needs audit */ - x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); - x->u.rt.rt_type = rt0->rt_type; - x->u.rt.rt_src = rt0->rt_src; - x->u.rt.rt_dst = rt0->rt_dst; - x->u.rt.rt_gateway = rt0->rt_gateway; - x->u.rt.rt_spec_dst = rt0->rt_spec_dst; - x->u.rt.idev = rt0->idev; - in_dev_hold(rt0->idev); - header_len -= x->u.dst.xfrm->props.header_len; - trailer_len -= x->u.dst.xfrm->props.trailer_len; - } + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | + RTCF_LOCAL); + xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_src = rt->rt_src; + xdst->u.rt.rt_dst = rt->rt_dst; + xdst->u.rt.rt_gateway = rt->rt_gateway; + xdst->u.rt.rt_spec_dst = rt->rt_spec_dst; - xfrm_init_pmtu(dst); return 0; - -error: - if (dst) - dst_free(dst); - return err; } static void -_decode_session4(struct sk_buff *skb, struct flowi *fl) +_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) { struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; @@ -212,8 +137,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports = (__be16 *)xprth; - fl->fl_ip_sport = ports[0]; - fl->fl_ip_dport = ports[1]; + fl->fl_ip_sport = ports[!!reverse]; + fl->fl_ip_dport = ports[!reverse]; } break; @@ -255,12 +180,12 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) } } fl->proto = iph->protocol; - fl->fl4_dst = iph->daddr; - fl->fl4_src = iph->saddr; + fl->fl4_dst = reverse ? iph->saddr : iph->daddr; + fl->fl4_src = reverse ? iph->daddr : iph->saddr; fl->fl4_tos = iph->tos; } -static inline int xfrm4_garbage_collect(void) +static inline int xfrm4_garbage_collect(struct dst_ops *ops) { xfrm4_policy_afinfo.garbage_collect(); return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); @@ -295,7 +220,8 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt.idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); + struct in_device *loopback_idev = + in_dev_get(dev->nd_net->loopback_dev); BUG_ON(!loopback_idev); do { @@ -318,6 +244,7 @@ static struct dst_ops xfrm4_dst_ops = { .update_pmtu = xfrm4_update_pmtu, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, + .local_out = __ip_local_out, .gc_thresh = 1024, .entry_size = sizeof(struct xfrm_dst), }; @@ -328,8 +255,10 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .find_bundle = __xfrm4_find_bundle, - .bundle_create = __xfrm4_bundle_create, .decode_session = _decode_session4, + .get_tos = xfrm4_get_tos, + .init_path = xfrm4_init_path, + .fill_dst = xfrm4_fill_dst, }; static void __init xfrm4_policy_init(void) diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 13d54a1c3337..fdeebe68a379 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -11,6 +11,7 @@ #include <net/xfrm.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> +#include <linux/netfilter_ipv4.h> static struct xfrm_state_afinfo xfrm4_state_afinfo; @@ -47,12 +48,31 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, x->props.family = AF_INET; } +int xfrm4_extract_header(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->id = iph->id; + XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; + XFRM_MODE_SKB_CB(skb)->tos = iph->tos; + XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; + memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + + return 0; +} + static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, + .proto = IPPROTO_IPIP, + .eth_proto = htons(ETH_P_IP), .owner = THIS_MODULE, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, .output = xfrm4_output, + .extract_input = xfrm4_extract_input, + .extract_output = xfrm4_extract_output, + .transport_finish = xfrm4_transport_finish, }; void __init xfrm4_state_init(void) |